The project analyses the family's expenses using the transactions of two different bank accounts, both of which are used by the family. The transactions come in XLS and CSV format, and each file stores its data differently, so data cleaning and wrangling were needed before all the data could be merged. The data was collected from the ABN AMRO and ING bank web applications.
The project is analysing the expenses of the family by using two different bank account transactions.
What is the descriptive analysis of the expenses?
Why can money not be saved?
######################### Below are the libraries that are used in order to perform EDA (Exploratory data analysis).
print("{:#^150}".format(" Importing Necessary Libraries "))
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt #visualisation
mpl.style.use("ggplot")
from os import listdir
from os.path import isfile, join
#from tkinter.filedialog import askdirectory
import time
from tqdm.notebook import tqdm
print("imported!!")
########################################################### Importing Necessary Libraries ############################################################ imported!!
The functions used throughout this notebook are:
###################################Functions and Methods
print("{:#^150}".format(" Defining Functions and Methods "))
# Definitions-------------
def findMissingData(dataFrame):
    """Report, column by column, whether dataFrame contains missing values.

    For every column a header line is printed; columns containing at least
    one null additionally get the True/False counts from
    isnull().value_counts().  Prints only -- returns None.
    """
    missing_data = dataFrame.isnull()
    for column in missing_data.columns:
        print("-------", column, "-------")
        # .any() already yields a bool; compare it directly instead of the
        # original round-trip through str(...) == "True".
        if missing_data[column].any():
            print("***There is MISSING data***")
            print(missing_data[column].value_counts())
        else:
            print("There is NO missing data")
def cleaningProcess(dataFrame): #wrangling process
    """Standard wrangling applied to every bank export.

    Steps: drop duplicate rows, normalise column names (lowercase, no
    spaces), parse the yyyymmdd 'date' column, derive day/month/year
    columns, and sort chronologically with a fresh index.

    Mutates dataFrame in place and also returns it for convenience.
    """
    print("--------> Process starts....")
    # duplication remove
    print("--------> Removing duplicates")
    dataFrame.drop_duplicates(inplace=True)
    # make lowercase of the column names
    print("--------> Make lowercase of the column names")
    dataFrame.columns = dataFrame.columns.str.lower()
    # remove whitespaces of the column names
    print("--------> Removing whitespaces of the column names")
    dataFrame.columns = dataFrame.columns.str.replace(' ', '')
    # changing date format (both banks export dates as yyyymmdd)
    print("--------> Changing date column format")
    dataFrame["date"] = pd.to_datetime(dataFrame["date"], format="%Y%m%d")
    # creating separate day, month and year info
    print("--------> Creating separate day, month, year columns")
    dataFrame["day"] = dataFrame["date"].dt.day_name()
    dataFrame["month"] = dataFrame["date"].dt.month_name()
    dataFrame["year"] = dataFrame["date"].dt.year
    # sort the data according to date info.  ignore_index=True already
    # resets the index, so the original extra reset_index call was redundant.
    print("--------> Sorting data by Date column and reseting index values")
    dataFrame.sort_values(["date"], ignore_index=True, inplace=True)
    print("--------> Process Finished")
    return dataFrame
def dayMonthCategoryChange(dataFrame):
    """Cast the 'day' and 'month' columns to ordered Categorical dtype.

    Ordering follows the calendar (January..December, Monday..Sunday) so
    groupby results and plots come out in natural order.  Mutates in place.
    """
    calendar_order = {
        "month": ["January", "February", "March", "April", "May", "June",
                  "July", "August", "September", "October", "November",
                  "December"],
        "day": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                "Saturday", "Sunday"],
    }
    for col, order in calendar_order.items():
        dataFrame[col] = pd.Categorical(dataFrame[col], categories=order, ordered=True)
print("{:#^150}".format(" done! "))
def remove_outlier(df_in, col_name):
    """Drop rows of df_in whose col_name value lies outside the Tukey fences.

    Fences are q1 - 1.5*IQR and q3 + 1.5*IQR; values exactly on a fence are
    excluded as well (strict inequalities).  Returns the filtered frame.
    """
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3 - q1  # Interquartile range
    print("Interquartile range is ", iqr)
    fence_low = q1 - 1.5 * iqr
    print("Lower fence is ", fence_low)
    fence_high = q3 + 1.5 * iqr
    print("Higher fence is ", fence_high)
    inside_fences = (df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)
    return df_in.loc[inside_fences]
########################################################### Defining Functions and Methods ########################################################### ####################################################################### done! ########################################################################
######################### Creating Dataframes for each ABN Amro and ING Bank from excel files
print("{:#^150}".format(" Reading Excel Files and Creating Dataframes for both ABN Amro and ING Bank "))
# define the path. this folder contains excel and CSV files that belong to ING and ABN Bank
# and these files are regularly downloaded from internet banking sites.
mypath = r"D:\00_BACKUP_BY_DRIVE\Bank_expense_analysis\DATA"
files = listdir(mypath)
# define List that keeps every file as dataframes
dataList_abn = []
dataList_ing = []
# define empty dataFrame for both
abnAmro = pd.DataFrame()
ingBank = pd.DataFrame()
# read every file in the folder and append into List. Luckily ABN data is in
# excel format and ING is in CSV format (';'-separated with ',' decimals).
for file in tqdm(files):
    full_path = join(mypath, file)  # 'join' comes from the os.path import at the top of the file
    lower_name = file.lower()
    # endswith() instead of the original substring test: ".xls" in file also
    # matched ".xlsx" (making that branch dead) and would match an extension
    # appearing anywhere in the name; the tuple form covers both Excel suffixes.
    if lower_name.endswith((".xls", ".xlsx")):
        dataList_abn.append(pd.read_excel(full_path))
        print("-------> ABN AMRO")
    elif lower_name.endswith(".csv"):
        dataList_ing.append(pd.read_csv(full_path, decimal=",", sep=";"))
        print("-------> ING BANK")
    else:
        print(file, "--------------->This file is not valid")
# merge every record in every dataframe in the list to the one dataframe
abnAmro = pd.concat(dataList_abn)
ingBank = pd.concat(dataList_ing)
# take copies of both dataframe in order to keep raw data
df_abnAmro = abnAmro.copy()
df_ingBank = ingBank.copy()
# deleting temporary used variables
del(dataList_abn, dataList_ing)
print("Dataframes, df_abnAmro and df_ingBank, have been created")
print("--------------------------------------------------------")
##################################### Reading Excel Files and Creating Dataframes for both ABN Amro and ING Bank #####################################
0%| | 0/25 [00:00<?, ?it/s]
-------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ABN AMRO -------> ING BANK -------> ING BANK -------> ING BANK -------> ING BANK -------> ING BANK -------> ING BANK -------> ING BANK -------> ING BANK -------> ING BANK -------> ING BANK Dataframes, df_abnAmro and df_ingBank, have been created --------------------------------------------------------
##################### Cleaning the data
## ABN AMRO
print("{:#^150}".format(" CLEANING ABN AMRO DATA "))
print("--------> ABN AMRO - removing unnecessary columns")
df_abnAmro.drop(["accountNumber", "mutationcode", "valuedate"], inplace=True, axis=1)
# 1. Some column names are renamed so both banks' dataframes share one schema.
print("--------> ABN AMRO - renaming columns")
df_abnAmro.rename(columns={"transactiondate": "date",
                           "startsaldo": "startbalance",
                           "endsaldo": "endbalance"}, inplace=True)
# 2. ING labels transactions Debit/Credit explicitly; ABN only encodes the
#    direction in the sign of the amount.  Default everything to Debit, then
#    flag the positive amounts as Credit.
df_abnAmro["debit_credit"] = "Debit"
df_abnAmro.loc[df_abnAmro["amount"] > 0, ["debit_credit"]] = "Credit"
# 3. With the direction captured above, amounts can be made uniformly positive
#    so calculations and graphs are easier; .abs() replaces the original lambda.
df_abnAmro["amount"] = df_abnAmro["amount"].abs()
# 4. Shared cleaning pipeline (dedupe, column names, date parsing, sort).
cleaningProcess(df_abnAmro)
# 5. Two variables were added, namely "name" and "bankname".
df_abnAmro["name"] = np.nan
df_abnAmro["bankname"] = "ABN AMRO"
# 6. Day and Month defined as ordered categorical variables.
dayMonthCategoryChange(df_abnAmro)
print("--------> Cleaning ABN AMRO - completed")
# 7. The "name" column is filled by scanning the free-text description for
#    known merchant names (manually curated list); matches are stored uppercased.
nameList = ["Jumbo", "Albert Heijn", "Lidl", "Nettorama", "Aldi", "Sioux Expats", "KINDERBIJSLAG", "Ozturk Fresh Food", "Izmir", "Izzet", "Can Slagerij", "TAMOIL", "Shell", "TINQ", "TotalEnergies", "Total",
            "Esso", "Action", "Hema", "Praxis", "solow", "Gamma", "Ikea", "Hornbach", "PRIMARK", "FirstXL", "H&M", "Primera", "So-Low", "Wibra", "ETOS", "McDonald's", "the Post", "Oakwood", "la Toscana",
            "the Brand", "Arif", "Kruidvat", "Holland & Barrett", "Decathlon", "Youfone", "REVOLT21", "Chiroprac", "B-Fysic Andromeda", "Coolblue", "Sioux", "brabant water", "KPN", "Notariskantoor",
            "FBTO", "REAAL", "ANAC", "ASR", "SCHADEVERZEKERING", "SCHADEV", "bol.com b.v.", "ABN AMRO", "ennatuurlijk b.v.", "Q Park", "W.H.W. Volleman - van Geertruy", "C&A", "Den Ekkerman",
            "CCV Slagerij Zeraan", "Mad Science Brabant by Sisow", "e. tasel", "Amazon", "PATHE", "gemeente Eindhoven", "City Sport", "INTERTOYS", "ov-chipkaart", "sge", "Jamin", "SAMSUNG",
            "Monkey Town Group", "Boshuys", "Corendon", "Ziya", "Step by Step", "Enthousiasment", "Expert", "BLOKKER", "ZOOPLUS", "GR-043-D", "GR043D", "Park", "Las Salinas", "Kwik Fit", "EKC Afbouwmaterialen", "Q Park", "SPEELGOEDWINKEL"]
for name in tqdm(nameList):
    # regex=False: entries such as "bol.com b.v." contain regex metacharacters
    # ('.') that would otherwise match any character -- we want literal matches.
    # na=False keeps the mask strictly boolean should a description ever be null.
    df_abnAmro.loc[df_abnAmro["description"].str.contains(name, case=False, regex=False, na=False), ["name"]] = name.upper()
############################################################### CLEANING ABN AMRO DATA ############################################################### --------> ABN AMRO - removing unnecessary columns --------> ABN AMRO - renaming columns --------> Process starts.... --------> Removing duplicates --------> Make lowercase of the column names --------> Removing whitespaces of the column names --------> Changing date column format --------> Creating separate day, month, year columns --------> Sorting data by Date column and reseting index values --------> Process Finished --------> Cleaning ABN AMRO - completed
0%| | 0/91 [00:00<?, ?it/s]
############################# ING BANK
print("{:#^150}".format(" CLEANING ING BANK DATA "))
print("--------> ING BANK - removing unnecessary columns")
unused_columns = ["Tag", "Account", "Counterparty", "Code", "Transaction type"]
df_ingBank.drop(unused_columns, inplace=True, axis=1)
print("--------> ING BANK - renaming columns")
# 1. Align the ING column names with the ABN AMRO schema.
column_map = {
    "Amount (EUR)": "amount",
    "Resulting balance": "endbalance",
    "Name / Description": "name",
    "Debit/credit": "debit_credit",
    "Notifications": "description",
}
df_ingBank.rename(columns=column_map, inplace=True)
# 2. Shared cleaning pipeline (dedupe, lowercase columns, dates, sort).
cleaningProcess(df_ingBank)
# 3. ING only reports the resulting balance; reconstruct the starting balance
#    from the direction of the transaction.
is_debit = df_ingBank["debit_credit"] == "Debit"
is_credit = df_ingBank["debit_credit"] == "Credit"
df_ingBank.loc[is_debit, "startbalance"] = df_ingBank["endbalance"] + df_ingBank["amount"]
df_ingBank.loc[is_credit, "startbalance"] = df_ingBank["endbalance"] - df_ingBank["amount"]
# 4. Tag every row with the bank it came from.
df_ingBank["bankname"] = "ING BANK"
# 5. Day and Month become ordered categoricals.
dayMonthCategoryChange(df_ingBank)
print("--------> Cleaning ING BANK - completed")
############################################################### CLEANING ING BANK DATA ############################################################### --------> ING BANK - removing unnecessary columns --------> ING BANK - renaming columns --------> Process starts.... --------> Removing duplicates --------> Make lowercase of the column names --------> Removing whitespaces of the column names --------> Changing date column format --------> Creating separate day, month, year columns --------> Sorting data by Date column and reseting index values --------> Process Finished --------> Cleaning ING BANK - completed
print("{:#^150}".format(" MERGING ABN AMRO and ING BANK DATA "))
# Internal transfers between the two accounts appear twice (Debit on one side,
# Credit on the other); the ABN->ING pairs are removed so income stays unique.
# The IBAN values are shown as XXXX because of the sensitive information.
abn_internal = (df_abnAmro["debit_credit"] == "Debit") & (df_abnAmro["description"].str.contains("NL79INGBXXXXXXXXXX"))
ing_internal = (df_ingBank["debit_credit"] == "Credit") & (df_ingBank["description"].str.contains("NL20ABNAXXXXXXXXXX"))
df1 = df_abnAmro.drop(index=df_abnAmro.loc[abn_internal].index)
df2 = df_ingBank.drop(index=df_ingBank.loc[ing_internal].index)
# Shared column order for the combined frame.
varOrder = ["date", "day", "month", "year", "name", "startbalance", "endbalance",
            "debit_credit", "amount", "description", "bankname"]
# Stack both banks, reordering the columns on the way in.
df = pd.concat([df1[varOrder], df2[varOrder]], ignore_index=True)
# Chronological order with a fresh RangeIndex.
print("--------> Sorting data by Date column and reseting index values")
df.sort_values(["date"], ignore_index=True, inplace=True)
df.reset_index(drop=True, inplace=True)
print("--------> DataFrame df has been created")
# deleting temporary used variables
del(df1, df2)
######################################################### MERGING ABN AMRO and ING BANK DATA ######################################################### --------> Sorting data by Date column and reseting index values --------> DataFrame df has been created
# LABELING
print("{:#^150}".format(" LABELING STARTED "))
# Ordered (pattern, label) pairs for Debit rows.  Rules are applied top to
# bottom, so when two patterns match the same row the LATER label wins --
# the order below must therefore be preserved.
debit_rules = [
    ("Land:", "Abroad"),
    ("Ozturk|Izmir|Izzet|Can Slagerij|CCV Slagerij Zeraan", "Turk market"),
    ("TAMOIL|Shell|TINQ|TotalEnergies|Total|Esso|ELAN", "Fuel"),
    ("Jumbo|Albert Heijn|Lidl|Nettorama|Aldi|KEURSLAGERIJ|Supermarkt|KAUFLAND|Slijterij vd Heyden|Bazar|Holland Kaascentru|KAASHUIS", "Grocery"),
    ("Action|hema|Praxis|solow|Gamma|Ikea|Hornbach|PRIMARK|FirstXL|H&M|H & M|Primera|So-Low|Wibra|ETOS|C&A|CASA|Nike|BLOKKER|Scapino|Mammut|Hilfiger|Xenos|Expert Veldhoven", "Store"),
    ("McDonald's|Mc Donalds|the Post|Oakwood|la Toscana|the Brand|Arif|Las Salinas|Happy Italy|Buffel|Amigos BBQ|Ziyas Taste|Paviljoen Genneper|Cafetaria|Five Guys|DUTCHHOMEMADE|Jamin|RESTAURANT|Stadspaviljoen|Welschap|Burgers|Sushipoint|Boshuys|Ziya|Snackbar|GRAND CAFE", "Restaurant"),
    ("Kruidvat", "Kruidvat"),
    ("Holland & Barrett", "Holland & Barrett"),
    ("Decathlon", "Decathlon"),
    ("iDEAL|bol.com", "Online Shopping"),
    ("Youfone", "Youfone"),
    ("TOYS|Bruna|SPEELGOEDWINKEL", "Books&Toys"),
    ("NL93INGB0000724779|NL07RABO0123168686|Doga|Step by Step", "Courses"),
    ("Sioux", "Sioux"),
    ("ENNATUURLIJK|ENECO|KPN|brabant water|T-MOBILE", "House Fixed Exp."),
    ("EKC Afbouwmaterialen|WoonEnzo|c. colleij", "Furniture"),
    ("Notariskantoor", "Notary"),
    ("REVOLT21", "REVOLUT"),
    ("Chiroprac|B-FYSIC|Maxima|NL60INGB0663263891|Apotheek", "Health"),
    ("NL32ABNA0455685711", "House Rent"),
    ("GR-043-D|gr043d|oostendorp|Kwik Fit|BELASTINGDIENST", "Car"),
    ("Parking|Q Park|Parkmobile", "Car Park"),
    ("Sport|Den Ekkerman|Laco|DéDé B.V.|City Spo", "Sport"),
    ("FBTO|REAAL|ANAC|ASR|SCHADEVERZEKERING|SCHADEV|Zilveren Kruis", "Insurance"),
    ("ING Hypotheken", "Mortgage"),
    ("Kosten", "ING Bank"),
    ("Basic Package", "ABNAMRO Bank"),
    ("OV-Chipkaart|NSAutomaat", "Transportation"),
    ("Corendon|Airport", "Airplane"),
    ("Geldmaat", "Money Withdraw"),
    ("Tikkie|UCAK|ERKAN|Isiksal|Filoglu", "Money Transfer"),
    ("Cinema|Pathe|Efteling|Toverland|Laco|Klimcentrum", "Cinema & Fun"),
    ("Camping", "Holiday"),
]
# Scan both free-text columns, exactly as the rules were originally applied.
for i in ["description", "name"]:
    for pattern, label in debit_rules:
        df.loc[(df["debit_credit"] == "Debit") & (df[i].str.contains(pattern, case=False)), ["label"]] = label
    # Credit side: everything is a payback unless it is salary or child benefit.
    df.loc[(df["debit_credit"] == "Credit"), ["label"]] = "Payback"
    df.loc[(df["debit_credit"] == "Credit") & (df["description"].str.contains("Sioux Expats", case=False)), ["label"]] = "Income"
    df.loc[(df["debit_credit"] == "Credit") & (df["description"].str.contains("KINDERBIJSLAG", case=False)), ["label"]] = "Kinderbijslag"
findMissingData(df)
# Whatever is still unlabelled / unnamed falls into the catch-all buckets.
df.loc[df["label"].isna(), ["label"]] = "Other Expenses"
df.loc[df["name"].isna(), ["name"]] = "Other"
print(df["label"].unique())
findMissingData(df)
print("{:#^150}".format(" LABELING FINISHED "))
################################################################## LABELING STARTED ################################################################## ------- date ------- There is NO missing data ------- day ------- There is NO missing data ------- month ------- There is NO missing data ------- year ------- There is NO missing data ------- name ------- ***There is MISSING data*** False 1775 True 434 Name: name, dtype: int64 ------- startbalance ------- There is NO missing data ------- endbalance ------- There is NO missing data ------- debit_credit ------- There is NO missing data ------- amount ------- There is NO missing data ------- description ------- There is NO missing data ------- bankname ------- There is NO missing data ------- label ------- ***There is MISSING data*** False 1997 True 212 Name: label, dtype: int64 ['Store' 'Grocery' 'Restaurant' 'Furniture' 'Abroad' 'Insurance' 'Health' 'Online Shopping' 'Other Expenses' 'Youfone' 'Sioux' 'Income' 'Fuel' 'Transportation' 'ABNAMRO Bank' 'Car' 'House Rent' 'Kruidvat' 'Cinema & Fun' 'Decathlon' 'Turk market' 'Sport' 'Payback' 'Notary' 'Courses' 'Car Park' 'Money Transfer' 'Kinderbijslag' 'House Fixed Exp.' 
'Money Withdraw' 'Mortgage' 'ING Bank' 'Holland & Barrett' 'Books&Toys' 'Airplane' 'REVOLUT' 'Holiday'] ------- date ------- There is NO missing data ------- day ------- There is NO missing data ------- month ------- There is NO missing data ------- year ------- There is NO missing data ------- name ------- There is NO missing data ------- startbalance ------- There is NO missing data ------- endbalance ------- There is NO missing data ------- debit_credit ------- There is NO missing data ------- amount ------- There is NO missing data ------- description ------- There is NO missing data ------- bankname ------- There is NO missing data ------- label ------- There is NO missing data ################################################################# LABELING FINISHED ##################################################################
# Persist the merged, labelled dataset.
df.to_csv(r"D:\00_BACKUP_BY_DRIVE\Bank_expense_analysis\expenses.csv",index=True)
# find number of months in each year (distinct month names present per year)
months = dict(df.groupby("year")["month"].unique())
print(months)
numberofMonths = {year: len(month_names) for year, month_names in months.items()}
print(numberofMonths)
numberofmonths_df = pd.DataFrame(
    {"year": numberofMonths.keys(), "numberofmonths": numberofMonths.values()}
).set_index("year")
numberofmonths_df
{2021: ['July', 'August', 'September', 'October', 'November', 'December']
Categories (12, object): ['January' < 'February' < 'March' < 'April' ... 'September' < 'October' < 'November' < 'December'], 2022: ['January', 'February', 'March', 'April', 'May', ..., 'August', 'September', 'October', 'November', 'December']
Length: 12
Categories (12, object): ['January' < 'February' < 'March' < 'April' ... 'September' < 'October' < 'November' < 'December'], 2023: ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August']
Categories (12, object): ['January' < 'February' < 'March' < 'April' ... 'September' < 'October' < 'November' < 'December']}
{2021: 6, 2022: 12, 2023: 8}
| numberofmonths | |
|---|---|
| year | |
| 2021 | 6 |
| 2022 | 12 |
| 2023 | 8 |
# Structure overview: dtypes, non-null counts and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2209 entries, 0 to 2208 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 2209 non-null datetime64[ns] 1 day 2209 non-null category 2 month 2209 non-null category 3 year 2209 non-null int64 4 name 2209 non-null object 5 startbalance 2209 non-null float64 6 endbalance 2209 non-null float64 7 debit_credit 2209 non-null object 8 amount 2209 non-null float64 9 description 2209 non-null object 10 bankname 2209 non-null object 11 label 2209 non-null object dtypes: category(2), datetime64[ns](1), float64(3), int64(1), object(5) memory usage: 177.8+ KB
# (rows, columns) of the merged dataset.
df.shape
(2209, 12)
# Summary statistics of the numeric columns, transposed for readability.
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| year | 2209.0 | 2022.106836 | 0.712462 | 2021.00 | 2022.00 | 2022.00 | 2023.00 | 2023.00 |
| startbalance | 2209.0 | 6625.704640 | 8243.813899 | 17.73 | 2791.17 | 4784.02 | 6583.77 | 41860.21 |
| endbalance | 2209.0 | 6615.868728 | 8245.178905 | 8.77 | 2518.54 | 4784.02 | 6628.67 | 41860.21 |
| amount | 2209.0 | 148.537669 | 804.652742 | 0.01 | 9.25 | 26.00 | 60.44 | 24704.72 |
# Distinct-value count per column.
df.nunique()
date 648 day 7 month 12 year 3 name 298 startbalance 2187 endbalance 2188 debit_credit 2 amount 1231 description 2143 bankname 2 label 37 dtype: int64
# Confirm no missing values remain after labeling.
findMissingData(df)
------- date ------- There is NO missing data ------- day ------- There is NO missing data ------- month ------- There is NO missing data ------- year ------- There is NO missing data ------- name ------- There is NO missing data ------- startbalance ------- There is NO missing data ------- endbalance ------- There is NO missing data ------- debit_credit ------- There is NO missing data ------- amount ------- There is NO missing data ------- description ------- There is NO missing data ------- bankname ------- There is NO missing data ------- label ------- There is NO missing data
# Debit-only view of the transactions (the expense side of the analysis).
# .copy() detaches the slice from df: rows are later dropped from df_debit
# in place, which on a bare slice raises SettingWithCopyWarning (as seen in
# the original run) and risks writing back into df.
df_debit = df.loc[df["debit_credit"] == "Debit"].copy()
df_debit
| date | day | month | year | name | startbalance | endbalance | debit_credit | amount | description | bankname | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-07-09 | Friday | July | 2021 | ACTION | 34838.52 | 34817.84 | Debit | 20.68 | BEA NR:72327610 09.07.21/13.26 Action 1235,P... | ABN AMRO | Store |
| 1 | 2021-07-09 | Friday | July | 2021 | JUMBO | 34839.01 | 34838.52 | Debit | 0.49 | BEA NR:N179K7 09.07.21/12.55 Jumbo Eindh B... | ABN AMRO | Grocery |
| 2 | 2021-07-09 | Friday | July | 2021 | JUMBO | 34796.68 | 34786.84 | Debit | 9.84 | BEA NR:4QKP08 09.07.21/20.32 Jumbo 229276,... | ABN AMRO | Grocery |
| 3 | 2021-07-09 | Friday | July | 2021 | JUMBO | 34817.84 | 34804.97 | Debit | 12.87 | BEA NR:N179K7 09.07.21/13.52 Jumbo Eindh B... | ABN AMRO | Grocery |
| 4 | 2021-07-09 | Friday | July | 2021 | LA TOSCANA | 34804.97 | 34799.37 | Debit | 5.60 | BEA NR:90168064 09.07.21/16.41 LA TOSCANA IJ... | ABN AMRO | Restaurant |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2204 | 2023-08-28 | Monday | August | 2023 | Het Strandhuys Bergen NLD | 4035.30 | 3929.35 | Debit | 105.95 | Card sequence no.: 001 27/08/2023 17:34 Transa... | ING BANK | Store |
| 2205 | 2023-08-29 | Tuesday | August | 2023 | NETTORAMA 4 EINDHOVEN NLD | 3885.81 | 3863.70 | Debit | 22.11 | Card sequence no.: 001 28/08/2023 20:39 Transa... | ING BANK | Grocery |
| 2206 | 2023-08-29 | Tuesday | August | 2023 | Sioux Eindhoven EINDHOVEN NLD | 3863.70 | 3861.70 | Debit | 2.00 | Card sequence no.: 002 28/08/2023 11:52 Transa... | ING BANK | Sioux |
| 2207 | 2023-08-30 | Wednesday | August | 2023 | Meat UPP EINDHOVEN NLD | 3861.70 | 3842.02 | Debit | 19.68 | Card sequence no.: 001 29/08/2023 15:33 Transa... | ING BANK | Store |
| 2208 | 2023-08-30 | Wednesday | August | 2023 | Sioux Eindhoven EINDHOVEN NLD | 3842.02 | 3840.02 | Debit | 2.00 | Card sequence no.: 002 29/08/2023 12:03 Transa... | ING BANK | Sioux |
2100 rows × 12 columns
# Summary statistics for the debit (expense) records only.
df_debit.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| year | 2100.0 | 2022.105238 | 0.715893 | 2021.00 | 2022.0000 | 2022.000 | 2023.0000 | 2023.00 |
| startbalance | 2100.0 | 6688.739790 | 8310.719076 | 17.73 | 2787.2900 | 4803.130 | 6640.3100 | 41860.21 |
| endbalance | 2100.0 | 6605.442829 | 8287.388992 | 8.77 | 2461.4275 | 4775.915 | 6573.9575 | 41856.01 |
| amount | 2100.0 | 83.296962 | 575.949391 | 0.01 | 8.9900 | 25.000 | 59.9250 | 24704.72 |
# Distribution of the expense amounts; heavily right-skewed, which motivates
# the outlier handling further below.
plt.figure(figsize=(10,8))
sns.histplot(df_debit["amount"], bins=50)
plt.title("Distribution of the expense amounts")
plt.show()
# Box plot of the same column to surface the extreme values.
sns.boxplot(data=df_debit, y="amount")
<Axes: ylabel='amount'>
# Five largest expenses; the notary payment dominates everything else.
df_debit.sort_values(by=["amount"],ascending=False).head(5)
| date | day | month | year | name | startbalance | endbalance | debit_credit | amount | description | bankname | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 124 | 2021-09-17 | Friday | September | 2021 | NOTARISKANTOOR | 38463.69 | 13758.97 | Debit | 24704.72 | SEPA Overboeking IBAN: NL37RAB... | ABN AMRO | Notary |
| 309 | 2021-11-13 | Saturday | November | 2021 | Other | 8359.75 | 5265.75 | Debit | 3094.00 | SEPA Overboeking IBAN: NL94ABN... | ABN AMRO | Money Transfer |
| 1164 | 2022-09-01 | Thursday | September | 2022 | REVOLT21 | 8113.34 | 5613.34 | Debit | 2500.00 | SEPA Overboeking IBAN: LT02325... | ABN AMRO | REVOLUT |
| 877 | 2022-05-26 | Thursday | May | 2022 | REVOLT21 | 8172.31 | 6172.31 | Debit | 2000.00 | SEPA Overboeking IBAN: LT02325... | ABN AMRO | REVOLUT |
| 287 | 2021-11-04 | Thursday | November | 2021 | ING Hypotheken | 1745.32 | 141.70 | Debit | 1603.62 | Name: ING Hypotheken Description: NOTA INZAKE ... | ING BANK | Mortgage |
# Index label of the single largest expense (the notary payment).
df_debit.loc[df_debit["amount"]==df_debit["amount"].max()].index[0]
124
# Drop that single extreme record as a one-off outlier.
# NOTE(review): the SettingWithCopyWarning emitted here happens because
# df_debit was created as a slice of df without .copy(); creating it with
# df.loc[...].copy() would silence it safely.
df_debit.drop([df_debit.loc[df_debit["amount"]==df_debit["amount"].max()].index[0],],axis=0,inplace=True)
C:\Users\Erdinc\AppData\Local\Temp\ipykernel_1992\233275231.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_debit.drop([df_debit.loc[df_debit["amount"]==df_debit["amount"].max()].index[0],],axis=0,inplace=True)
# Largest remaining expenses after removing the notary transfer.
df_debit.sort_values(by=["amount"],ascending=False).head(5)
| date | day | month | year | name | startbalance | endbalance | debit_credit | amount | description | bankname | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 309 | 2021-11-13 | Saturday | November | 2021 | Other | 8359.75 | 5265.75 | Debit | 3094.00 | SEPA Overboeking IBAN: NL94ABN... | ABN AMRO | Money Transfer |
| 1164 | 2022-09-01 | Thursday | September | 2022 | REVOLT21 | 8113.34 | 5613.34 | Debit | 2500.00 | SEPA Overboeking IBAN: LT02325... | ABN AMRO | REVOLUT |
| 877 | 2022-05-26 | Thursday | May | 2022 | REVOLT21 | 8172.31 | 6172.31 | Debit | 2000.00 | SEPA Overboeking IBAN: LT02325... | ABN AMRO | REVOLUT |
| 287 | 2021-11-04 | Thursday | November | 2021 | ING Hypotheken | 1745.32 | 141.70 | Debit | 1603.62 | Name: ING Hypotheken Description: NOTA INZAKE ... | ING BANK | Mortgage |
| 150 | 2021-09-24 | Friday | September | 2021 | Other | 13003.91 | 11503.91 | Debit | 1500.00 | SEPA Overboeking IBAN: NL55RAB... | ABN AMRO | Furniture |
# One row fewer than before the drop (2100 -> 2099).
df_debit.shape
(2099, 12)
# Horizontal box plot of the debit amounts before IQR filtering.
sns.boxplot(data=df_debit, x="amount", orient="h")
<Axes: xlabel='amount'>
# Keep only amounts strictly inside the Tukey (1.5*IQR) fences.
df_debit_clean=remove_outlier(df_debit,"amount")
Interquartile range is 50.895 Lower fence is -67.3525 Higher fence is 136.22750000000002
# Inspect the IQR-filtered debit records.
df_debit_clean
| date | day | month | year | name | startbalance | endbalance | debit_credit | amount | description | bankname | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-07-09 | Friday | July | 2021 | ACTION | 34838.52 | 34817.84 | Debit | 20.68 | BEA NR:72327610 09.07.21/13.26 Action 1235,P... | ABN AMRO | Store |
| 1 | 2021-07-09 | Friday | July | 2021 | JUMBO | 34839.01 | 34838.52 | Debit | 0.49 | BEA NR:N179K7 09.07.21/12.55 Jumbo Eindh B... | ABN AMRO | Grocery |
| 2 | 2021-07-09 | Friday | July | 2021 | JUMBO | 34796.68 | 34786.84 | Debit | 9.84 | BEA NR:4QKP08 09.07.21/20.32 Jumbo 229276,... | ABN AMRO | Grocery |
| 3 | 2021-07-09 | Friday | July | 2021 | JUMBO | 34817.84 | 34804.97 | Debit | 12.87 | BEA NR:N179K7 09.07.21/13.52 Jumbo Eindh B... | ABN AMRO | Grocery |
| 4 | 2021-07-09 | Friday | July | 2021 | LA TOSCANA | 34804.97 | 34799.37 | Debit | 5.60 | BEA NR:90168064 09.07.21/16.41 LA TOSCANA IJ... | ABN AMRO | Restaurant |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2204 | 2023-08-28 | Monday | August | 2023 | Het Strandhuys Bergen NLD | 4035.30 | 3929.35 | Debit | 105.95 | Card sequence no.: 001 27/08/2023 17:34 Transa... | ING BANK | Store |
| 2205 | 2023-08-29 | Tuesday | August | 2023 | NETTORAMA 4 EINDHOVEN NLD | 3885.81 | 3863.70 | Debit | 22.11 | Card sequence no.: 001 28/08/2023 20:39 Transa... | ING BANK | Grocery |
| 2206 | 2023-08-29 | Tuesday | August | 2023 | Sioux Eindhoven EINDHOVEN NLD | 3863.70 | 3861.70 | Debit | 2.00 | Card sequence no.: 002 28/08/2023 11:52 Transa... | ING BANK | Sioux |
| 2207 | 2023-08-30 | Wednesday | August | 2023 | Meat UPP EINDHOVEN NLD | 3861.70 | 3842.02 | Debit | 19.68 | Card sequence no.: 001 29/08/2023 15:33 Transa... | ING BANK | Store |
| 2208 | 2023-08-30 | Wednesday | August | 2023 | Sioux Eindhoven EINDHOVEN NLD | 3842.02 | 3840.02 | Debit | 2.00 | Card sequence no.: 002 29/08/2023 12:03 Transa... | ING BANK | Sioux |
1940 rows × 12 columns
# Anti-join: the rows of df_debit that remove_outlier() filtered out.
# Selecting by index is more robust than the original
# concat + drop_duplicates(keep=False) trick, which would also silently drop
# any rows that happen to be exact duplicates within df_debit itself.
df_debit_outliers = df_debit.loc[~df_debit.index.isin(df_debit_clean.index)]
df_debit_outliers
| date | day | month | year | name | startbalance | endbalance | debit_credit | amount | description | bankname | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | 2021-07-10 | Saturday | July | 2021 | Other | 34786.84 | 34431.84 | Debit | 355.00 | BEA NR:EP001867 10.07.21/15.50 WoonEnzo BV,P... | ABN AMRO | Furniture |
| 44 | 2021-07-27 | Tuesday | July | 2021 | FBTO | 38650.11 | 38449.11 | Debit | 201.00 | SEPA Incasso algemeen doorlopend Incassant: NL... | ABN AMRO | Insurance |
| 45 | 2021-08-02 | Monday | August | 2021 | E. TASEL | 38333.51 | 37038.51 | Debit | 1295.00 | SEPA Overboeking IBAN: NL32ABN... | ABN AMRO | House Rent |
| 64 | 2021-08-27 | Friday | August | 2021 | FBTO | 41635.64 | 41434.64 | Debit | 201.00 | SEPA Incasso algemeen doorlopend Incassant: NL... | ABN AMRO | Insurance |
| 75 | 2021-09-01 | Wednesday | September | 2021 | E. TASEL | 41027.35 | 39732.35 | Debit | 1295.00 | SEPA Overboeking IBAN: NL32ABN... | ABN AMRO | House Rent |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2137 | 2023-07-27 | Thursday | July | 2023 | PARK | 9238.19 | 9018.20 | Debit | 219.99 | /TRTP/iDEAL/IBAN/NL51DEUT0265262461/BIC/DEUTNL... | ABN AMRO | Car Park |
| 2154 | 2023-08-01 | Tuesday | August | 2023 | Zilveren Kruis Zorgverzekeringen NV | 2391.08 | 2104.73 | Debit | 286.35 | Name: Zilveren Kruis Zorgverzekeringen NV Des... | ING BANK | Insurance |
| 2160 | 2023-08-01 | Tuesday | August | 2023 | ING Hypotheken | 3870.97 | 2391.08 | Debit | 1479.89 | Name: ING Hypotheken Description: NOTA INZAKE ... | ING BANK | Mortgage |
| 2168 | 2023-08-18 | Friday | August | 2023 | Pension De Kattenboom LIESSEL | 1860.35 | 1664.85 | Debit | 195.50 | Card sequence no.: 001 18/08/2023 09:08 Transa... | ING BANK | Store |
| 2192 | 2023-08-25 | Friday | August | 2023 | ENNATUURLIJK BV | 1166.78 | 903.14 | Debit | 263.64 | Name: ENNATUURLIJK BV Description: Omschrijvin... | ING BANK | House Fixed Exp. |
159 rows × 12 columns
# Sanity check of the clean/outlier split: 1940 cleaned + 159 outlier rows
# add up to the 2099 debit rows, out of 2209 total transactions.
df_debit_clean.shape
(1940, 12)
df_debit_outliers.shape
(159, 12)
df_debit.shape
(2099, 12)
df.shape
(2209, 12)
# Summary statistics of the outlier-free debit amounts (transposed to one row).
df_debit_clean[["amount"]].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| amount | 1940.0 | 32.336175 | 30.12816 | 0.01 | 8.0 | 21.88 | 51.71 | 135.49 |
# 2x2 grid of horizontal box plots for the outlier-free debit amounts:
# overall distribution, then broken down per year, month and weekday.
fig, ((bx0, bx1), (bx2, bx3)) = plt.subplots(2, 2, figsize=(15, 10))
panels = [
    (bx0, "Expense distribution", None),
    (bx1, "According to Year", "year"),
    (bx2, "According to Month", "month"),
    (bx3, "According to Day", "day"),
]
for axis, heading, group_col in panels:
    axis.set_title(heading)
    sns.boxplot(data=df_debit_clean, x="amount", y=group_col, ax=axis, orient="h")
plt.show()
# The five largest single expenses that survived outlier removal.
df_debit_clean.sort_values(by=["amount"], ascending=False).head(5)
| date | day | month | year | name | startbalance | endbalance | debit_credit | amount | description | bankname | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 211 | 2021-10-08 | Friday | October | 2021 | GAMMA | 12212.41 | 12076.92 | Debit | 135.49 | BEA NR:96BS02 08.10.21/09.56 GAMMA Son en ... | ABN AMRO | Store |
| 921 | 2022-06-05 | Sunday | June | 2022 | Figure Skating Academy The Netherlands | 895.93 | 760.93 | Debit | 135.00 | Name: Figure Skating Academy The Netherlands D... | ING BANK | Courses |
| 1625 | 2023-02-05 | Sunday | February | 2023 | PRAXIS | 6206.54 | 6072.57 | Debit | 133.97 | BEA, Betaalpas Praxis 322 Ei... | ABN AMRO | Store |
| 1480 | 2022-12-17 | Saturday | December | 2022 | Other | 3471.85 | 3338.85 | Debit | 133.00 | BEA, Betaalpas CCV*WYNWOOD B... | ABN AMRO | Other Expenses |
| 179 | 2021-09-30 | Thursday | September | 2021 | PRAXIS | 14618.18 | 14487.46 | Debit | 130.72 | BEA NR:AE4802 30.09.21/12.00 PRAXIS EINDHO... | ABN AMRO | Store |
# Pie chart of the ten expense categories with the largest totals in the
# outlier-free data.
pie_chart_df = (
    df_debit_clean.groupby(["label"])[["amount"]]
    .sum("amount")
    .sort_values(by="amount", ascending=False)
    .head(10)
)
pie_chart_df.plot(
    kind="pie",
    y="amount",
    figsize=[8, 8],
    labels=pie_chart_df.index,   # show the category name next to each slice
    autopct='%0.01f%%',
    shadow=False,
    pctdistance=0.9,             # distance of the percentage text from the centre
    legend=False,
    ylabel="",
    title="The Top 10 Expenses Which Have No Outliers")
<Axes: title={'center': 'The Top 10 Expenses Which Have No Outliers'}>
# One pie chart per year showing that year's top-10 expense categories
# (outlier-free data).  The years plotted are driven by df_debit["year"].unique().
df_list = dict()
nrow = 1
ncol = 3
# make a dict of one top-10 Series per year
for yr in df_debit["year"].unique():
    df_list[yr] = (
        df_debit_clean.loc[(df_debit_clean["year"] == yr)]
        .groupby(["label"])
        .sum("amount")
        .sort_values(by="amount", ascending=False)
        .head(10)["amount"]
    )
fig, axes = plt.subplots(nrow, ncol, figsize=(15, 15), sharey=False, sharex=True)
fig.suptitle(t="The Top 10 Expenses of ", y=0.65, fontsize=16)
# Fixed: iterate the year keys directly.  The original integer counter
# (`count = first_key; count += 1`) silently plotted the wrong Series if the
# years were not consecutive.
for c, (yr, series) in enumerate(df_list.items()):
    series.plot(kind="pie",
                y="amount",
                labels=series.index,   # label each slice with its category
                autopct='%0.01f%%',
                shadow=False,
                pctdistance=0.7,       # percentage text inside each slice
                legend=False,
                ylabel="",
                ax=axes[c])
    title = str(yr) + " (Outliers Removed)"
    axes[c].set_title(title)
    axes[c].set_ylabel('')
# Year-by-year histogram (with KDE overlay) of the outlier-free amounts.
colors = sns.color_palette('bright')[0:3]
plt.figure(figsize=(10, 8))
sns.histplot(df_debit_clean, x="amount", hue="year", bins=20, kde=True,
             palette=colors, multiple="dodge")
plt.title("Distribution of the expenses (Outliers Removed)")
plt.show()
# Three stacked horizontal bar charts: the top-10 expense categories of each
# year (outlier-free data), every bar annotated with its truncated euro total.
fig = plt.figure()  # create figure
row_colum = [3, 1]
ax0 = fig.add_subplot(row_colum[0], row_colum[1], 1)
ax1 = fig.add_subplot(row_colum[0], row_colum[1], 2)
ax2 = fig.add_subplot(row_colum[0], row_colum[1], 3)
figsize = (10, 10)
# The three sections only differed in the year filter and the target axis, so
# the copy-pasted code is collapsed into one loop.
for yr, axis in [(2021, ax0), (2022, ax1), (2023, ax2)]:
    df_query = (
        df_debit_clean.loc[(df_debit_clean["year"] == yr)]
        .groupby(["label"])
        .sum("amount")
        .sort_values(by="amount", ascending=False)
        .head(10)
    )
    df_query["amount"].plot(kind="barh", figsize=figsize, ax=axis)
    for index, value in enumerate(df_query["amount"]):
        # int(value) truncates the cents; the euro sign is prepended.
        axis.annotate("\N{euro sign} " + str(int(value)), xy=(value, index - 0.2), color="black")
    axis.set_title("The Top 10 Expenses of " + str(yr))
    axis.set_ylabel('')
Text(0, 0.5, '')
# define number of rows and columns for subplots
df_list = dict()
nrow = 3
ncol = 1
# One bar chart per year with the 10 largest expense categories; ascending
# sort + tail(10) keeps the biggest bar at the top of each barh panel.
for yr in df_debit["year"].unique():
    df_list[yr] = (
        df_debit_clean.loc[(df_debit_clean["year"] == yr)]
        .groupby(["label"])
        .sum("amount")
        .sort_values(by="amount", ascending=True)
        .tail(10)["amount"]
    )
fig, axes = plt.subplots(nrow, ncol, figsize=(10, 10), sharey=False, sharex=True)
# Fixed: iterate the year keys directly.  The original integer counter assumed
# the years are consecutive integers, which breaks silently for a gap year.
for r, (yr, series) in enumerate(df_list.items()):
    series.plot(kind="barh", ax=axes[r])
    for index, value in enumerate(pd.DataFrame(series)["amount"]):
        axes[r].annotate("\N{euro sign} " + str(int(value)), xy=(value, index - 0.2), color="black")
    title = "The Top 10 Expenses of " + str(yr) + " (Outliers Removed)"
    axes[r].set_title(title)
    axes[r].set_ylabel('')
# Yearly totals of the cleaned debits, joined with the number of months that
# have data in each year, so an average monthly spend can be derived.
yearly_totals = df_debit_clean.groupby(["year"]).sum("amount")
sum_clean_expenses = pd.concat([yearly_totals, numberofmonths_df], axis=1)
sum_clean_expenses["averagedebit"] = (
    sum_clean_expenses["amount"] / sum_clean_expenses["numberofmonths"]
)
sum_clean_expenses["averagedebit"].plot(kind="barh")
sum_clean_expenses
| startbalance | endbalance | amount | numberofmonths | averagedebit | |
|---|---|---|---|---|---|
| year | |||||
| 2021 | 6644407.14 | 6632555.61 | 11851.53 | 6 | 1975.255000 |
| 2022 | 4804081.86 | 4773502.15 | 30579.71 | 12 | 2548.309167 |
| 2023 | 1574795.29 | 1554494.35 | 20300.94 | 8 | 2537.617500 |
# Summary statistics of the 159 outlier amounts (transposed to one row).
df_debit_outliers[["amount"]].describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| amount | 159.0 | 550.230943 | 554.591932 | 137.65 | 175.03 | 267.46 | 816.475 | 3094.0 |
# 2x2 grid of horizontal box plots for the OUTLIER transactions: overall
# distribution, then broken down per year, month and weekday.
fig = plt.figure(figsize=(15, 10))
grid = [2, 2]
specs = [
    ("Expense distribution", None),
    ("According to Year", "year"),
    ("According to Month", "month"),
    ("According to Day", "day"),
]
for pos, (heading, group_col) in enumerate(specs, start=1):
    axis = fig.add_subplot(grid[0], grid[1], pos)
    axis.set_title(heading)
    sns.boxplot(data=df_debit_outliers, x="amount", y=group_col, ax=axis, orient="h")
plt.show()
# All outlier transactions, largest amount first.
df_debit_outliers.sort_values(by=["amount"],ascending=False)
| date | day | month | year | name | startbalance | endbalance | debit_credit | amount | description | bankname | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 309 | 2021-11-13 | Saturday | November | 2021 | Other | 8359.75 | 5265.75 | Debit | 3094.00 | SEPA Overboeking IBAN: NL94ABN... | ABN AMRO | Money Transfer |
| 1164 | 2022-09-01 | Thursday | September | 2022 | REVOLT21 | 8113.34 | 5613.34 | Debit | 2500.00 | SEPA Overboeking IBAN: LT02325... | ABN AMRO | REVOLUT |
| 877 | 2022-05-26 | Thursday | May | 2022 | REVOLT21 | 8172.31 | 6172.31 | Debit | 2000.00 | SEPA Overboeking IBAN: LT02325... | ABN AMRO | REVOLUT |
| 287 | 2021-11-04 | Thursday | November | 2021 | ING Hypotheken | 1745.32 | 141.70 | Debit | 1603.62 | Name: ING Hypotheken Description: NOTA INZAKE ... | ING BANK | Mortgage |
| 150 | 2021-09-24 | Friday | September | 2021 | Other | 13003.91 | 11503.91 | Debit | 1500.00 | SEPA Overboeking IBAN: NL55RAB... | ABN AMRO | Furniture |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1422 | 2022-11-25 | Friday | November | 2022 | ENNATUURLIJK BV | 1556.32 | 1418.67 | Debit | 137.65 | Name: ENNATUURLIJK BV Description: Omschrijvin... | ING BANK | House Fixed Exp. |
| 1088 | 2022-07-25 | Monday | July | 2022 | ENNATUURLIJK BV | 1025.83 | 888.18 | Debit | 137.65 | Name: ENNATUURLIJK BV Description: Omschrijvin... | ING BANK | House Fixed Exp. |
| 1338 | 2022-10-25 | Tuesday | October | 2022 | ENNATUURLIJK BV | 1360.06 | 1222.41 | Debit | 137.65 | Name: ENNATUURLIJK BV Description: Omschrijvin... | ING BANK | House Fixed Exp. |
| 1251 | 2022-09-26 | Monday | September | 2022 | ENNATUURLIJK BV | 1113.85 | 976.20 | Debit | 137.65 | Name: ENNATUURLIJK BV Description: Omschrijvin... | ING BANK | House Fixed Exp. |
| 1125 | 2022-08-25 | Thursday | August | 2022 | ENNATUURLIJK BV | 1146.84 | 1009.19 | Debit | 137.65 | Name: ENNATUURLIJK BV Description: Omschrijvin... | ING BANK | House Fixed Exp. |
159 rows × 12 columns
# Year-by-year histogram (with KDE overlay) of the outlier amounts.
colors = sns.color_palette('bright')[0:3]
plt.figure(figsize=(10, 8))
sns.histplot(df_debit_outliers, x="amount", hue="year", bins=20, kde=True,
             palette=colors, multiple="dodge")
plt.title("Distribution of the expenses outliers")
plt.show()
# Pie chart of the ten categories that account for the most outlier spending.
pie_chart_df = (
    df_debit_outliers.groupby(["label"])[["amount"]]
    .sum("amount")
    .sort_values(by="amount", ascending=False)
    .head(10)
)
pie_chart_df.plot(
    kind="pie",
    y="amount",
    figsize=[8, 8],
    labels=pie_chart_df.index,   # category name next to each slice
    autopct='%0.01f%%',
    shadow=False,
    pctdistance=0.9,             # push the percentage labels towards the rim
    legend=False,
    ylabel="",
    title="The Top 10 expenses of outliers")
<Axes: title={'center': 'The Top 10 expenses of outliers'}>
# Vertical box plot of the outlier amounts, one box per year.
sns.boxplot(data=df_debit_outliers, x="year", y="amount")
<Axes: xlabel='year', ylabel='amount'>
# Average monthly outlier spend per year: total outlier amount divided by the
# number of months with data in that year.
outlier_totals = df_debit_outliers.groupby(["year"]).sum("amount")
sum_outlier_expenses = pd.concat([outlier_totals, numberofmonths_df], axis=1)
sum_outlier_expenses["averagedebit"] = (
    sum_outlier_expenses["amount"] / sum_outlier_expenses["numberofmonths"]
)
sum_outlier_expenses["averagedebit"].plot(kind="barh")
sum_outlier_expenses
| startbalance | endbalance | amount | numberofmonths | averagedebit | |
|---|---|---|---|---|---|
| year | |||||
| 2021 | 515176.84 | 494238.65 | 20938.19 | 6 | 3489.698333 |
| 2022 | 267547.07 | 227321.54 | 40225.53 | 12 | 3352.127500 |
| 2023 | 201881.67 | 175558.67 | 26323.00 | 8 | 3290.375000 |
The average monthly expenses are increasing year over year.
# Dashboard of the raw (outliers included) expenses: transaction counts per
# label, per-year label totals, weekday pies and the average monthly spend.
# Fixed: the four bare .sum() calls raised pandas' numeric_only FutureWarning
# (visible in the cell output); they now use .sum("amount") like the rest of
# the notebook, which aggregates only the numeric columns.
fig = plt.figure()  # create figure
row_colum = [4, 2]
ax0 = fig.add_subplot(row_colum[0], row_colum[1], 1)
ax1 = fig.add_subplot(row_colum[0], row_colum[1], 2)
ax2 = fig.add_subplot(row_colum[0], row_colum[1], 3)
ax3 = fig.add_subplot(row_colum[0], row_colum[1], 4)
ax4 = fig.add_subplot(row_colum[0], row_colum[1], 5)
ax5 = fig.add_subplot(row_colum[0], row_colum[1], 6)
ax6 = fig.add_subplot(row_colum[0], row_colum[1], 7)
ax7 = fig.add_subplot(row_colum[0], row_colum[1], 8)  # NOTE(review): never drawn on in this cell
figsize = (20, 25)
################ ax0 — number of transactions per label
df["label"].value_counts().to_frame().plot(kind="barh", figsize=figsize, ax=ax0)
ax0.set_title("Number of expenses according to the Label")
ax0.set_ylabel('')
############### ax1 — 2022 label totals, mortgage excluded
df_debit = df.loc[(df["year"] == 2022) & (df["debit_credit"] == "Debit") & (df["label"] != "Mortgage")].groupby(["label", "year"]).sum("amount").reset_index("year")
df_barplot = df_debit.pivot(columns="year")["amount"]
df_barplot = df_barplot.fillna(0)
df_barplot.plot(kind="barh", figsize=figsize, xlim=(0, 7500), ax=ax1)
for index, value in enumerate(df_barplot[2022]):
    # annotate each bar with its truncated euro total
    ax1.annotate("\N{euro sign} " + str(int(value)), xy=(value, index - 0.3), color="black")
ax1.set_title("Sum of expenses in 2022 except Mortgage")
ax1.set_ylabel('')
############### ax2 — share of 2022 spending per weekday
pie_chart_df = df.loc[(df["debit_credit"] == "Debit") & (df["year"] == 2022) & (df["label"] != "Mortgage")].groupby(["day"])[["amount"]].sum()
pie_chart_df["amount"].plot(kind="pie",
                            figsize=figsize,
                            labels=None,        # no slice labels; legend added below
                            autopct='%0.02f%%',
                            shadow=True,
                            pctdistance=1.12,   # percentage text just outside the rim
                            explode=[0.1, 0, 0, 0, 0, 0, 0],
                            ax=ax2)
ax2.legend(labels=pie_chart_df.index, loc='upper left')
ax2.set_title("Sum of expenses for each day in 2022")
ax2.set_ylabel('')
#################### ax3 — 2023 label totals, mortgage excluded
df_debit = df.loc[(df["year"] == 2023) & (df["debit_credit"] == "Debit") & (df["label"] != "Mortgage")].groupby(["label", "year"]).sum("amount").reset_index("year")
df_barplot = df_debit.pivot(columns="year")["amount"]
df_barplot = df_barplot.fillna(0)
df_barplot.plot(kind="barh", figsize=figsize, xlim=(0, 7500), ax=ax3)
for index, value in enumerate(df_barplot[2023]):
    ax3.annotate("\N{euro sign} " + str(int(value)), xy=(value, index - 0.3), color="black")
ax3.set_title("Sum of expenses in 2023 except Mortgage")
ax3.set_ylabel('')
#################### ax4 — 2022 grocery spend per weekday
chart_df = df.loc[(df["debit_credit"] == "Debit") & (df["year"] == 2022) & (df["label"] == "Grocery")].groupby(["day"])[["amount"]].sum()
chart_df["amount"].plot(kind="pie",
                        figsize=figsize,
                        labels=chart_df["amount"].round(2),  # label slices with euro totals
                        shadow=True,
                        pctdistance=1.12,
                        explode=[0.1, 0, 0, 0, 0, 0, 0],
                        ax=ax4)
ax4.set_title("Sum of Grocery expenses in 2022")
ax4.legend(labels=chart_df.index, loc='upper left')
ax4.set_ylabel('')
########################### ax5 — 2022 vs 2023 label totals side by side
df_debit = df.loc[(df["year"] >= 2022) & (df["debit_credit"] == "Debit") & (df["label"] != "Mortgage")].groupby(["label", "year"]).sum("amount").reset_index("year")
df_barplot = df_debit.pivot(columns="year")["amount"]
df_barplot = df_barplot.fillna(0)
df_barplot.plot(kind="barh", figsize=figsize, ax=ax5)
for index, value in enumerate(df_barplot[2022]):
    ax5.annotate("\N{euro sign} " + str(int(value)), xy=(value, index - 0.3), color="black")
for index, value in enumerate(df_barplot[2023]):
    ax5.annotate("\N{euro sign} " + str(int(value)), xy=(value, index), color="black")
#################################### ax6 — average monthly debit per year
sum_expenses = pd.concat([df.loc[df["debit_credit"] == "Debit"].groupby(["year"]).sum("amount"), numberofmonths_df], axis=1)
sum_expenses["averagedebit"] = sum_expenses["amount"] / sum_expenses["numberofmonths"]
sum_expenses["averagedebit"].plot(kind="barh", figsize=figsize, ax=ax6)
for index, value in enumerate(sum_expenses["averagedebit"]):
    ax6.annotate("\N{euro sign} " + str(int(value)), xy=(value, index), color="black")
ax6.set_title("Average expenses")
plt.show()
C:\Users\Erdinc\AppData\Local\Temp\ipykernel_1992\782940648.py:17: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
df_debit= df.loc[(df["year"]==2022) & (df["debit_credit"]=="Debit") & (df["label"] !="Mortgage")].groupby(["label","year"]).sum().reset_index("year")
C:\Users\Erdinc\AppData\Local\Temp\ipykernel_1992\782940648.py:48: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
df_debit= df.loc[(df["year"]==2023) & (df["debit_credit"]=="Debit") & (df["label"] !="Mortgage")].groupby(["label","year"]).sum().reset_index("year")
C:\Users\Erdinc\AppData\Local\Temp\ipykernel_1992\782940648.py:80: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
df_debit= df.loc[(df["year"]>=2022) & (df["debit_credit"]=="Debit") & (df["label"] !="Mortgage")].groupby(["label","year"]).sum().reset_index("year")
C:\Users\Erdinc\AppData\Local\Temp\ipykernel_1992\782940648.py:93: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sum_expenses=pd.concat([df.loc[df["debit_credit"]=="Debit"].groupby(["year"]).sum(),numberofmonths_df],axis=1)
# Second iteration of the expense dashboard (same 4x2 layout as the cell
# above, but every aggregation already uses .sum("amount")).
fig = plt.figure() # create figure
row_colum=[4,2]
ax0 = fig.add_subplot(row_colum[0], row_colum[1], 1) # add subplot 1 (4 rows, 2 columns, first plot)
ax1 = fig.add_subplot(row_colum[0], row_colum[1], 2) # add subplot 2
ax2 = fig.add_subplot(row_colum[0], row_colum[1], 3)
ax3 = fig.add_subplot(row_colum[0], row_colum[1], 4)
ax4 = fig.add_subplot(row_colum[0], row_colum[1], 5)
ax5 = fig.add_subplot(row_colum[0], row_colum[1], 6)
ax6 = fig.add_subplot(row_colum[0], row_colum[1], 7)
ax7 = fig.add_subplot(row_colum[0], row_colum[1], 8)
figsize=(20,25)
################ ax0 — 2021 label totals, mortgage excluded
df_debit= df.loc[(df["year"]==2021) & (df["debit_credit"]=="Debit") & (df["label"] !="Mortgage")].groupby(["label","year"]).sum("amount").reset_index("year")
df_barplot= df_debit.pivot(columns="year")["amount"]
df_barplot= df_barplot.fillna(0)
df_barplot.plot(kind="barh",figsize=figsize, xlim=(0,7500), ax= ax0)
for index, value in enumerate(df_barplot[2021]):
    # annotate each bar with its truncated euro total
    ax0.annotate("\N{euro sign} "+ str(int(value)), xy=(value, index-0.3), color="black")
ax0.set_title("Sum of expenses in 2021 except Mortgage")
ax0.set_ylabel('')
############### ax1 — 2022 label totals, mortgage excluded
df_debit= df.loc[(df["year"]==2022) & (df["debit_credit"]=="Debit") & (df["label"] !="Mortgage")].groupby(["label","year"]).sum("amount").reset_index("year")
df_barplot= df_debit.pivot(columns="year")["amount"]
df_barplot= df_barplot.fillna(0)
df_barplot.plot(kind="barh",figsize=figsize, xlim=(0,7500), ax= ax1)
for index, value in enumerate(df_barplot[2022]):
    # annotate each bar with its truncated euro total
    ax1.annotate("\N{euro sign} "+ str(int(value)), xy=(value, index-0.3), color="black")
ax1.set_title("Sum of expenses in 2022 except Mortgage")
ax1.set_ylabel('')
############### ax2 — 2023 label totals, mortgage excluded
df_debit= df.loc[(df["year"]==2023) & (df["debit_credit"]=="Debit") & (df["label"] !="Mortgage")].groupby(["label","year"]).sum("amount").reset_index("year")
df_barplot= df_debit.pivot(columns="year")["amount"]
df_barplot= df_barplot.fillna(0)
df_barplot.plot(kind="barh",figsize=figsize, xlim=(0,7500), ax= ax2)
for index, value in enumerate(df_barplot[2023]):
    ax2.annotate("\N{euro sign} "+ str(int(value)), xy=(value, index-0.3), color="black")
# NOTE(review): unlike the sibling panels, ax2 never calls set_ylabel('') —
# confirm whether the axis label should be cleared here too.
ax2.set_title("Sum of expenses in 2023 except Mortgage")
################ ax3 — NOTE(review): this panel is an exact duplicate of ax2
# (same 2023 filter, same title); presumably a different year or filter was
# intended — verify against the notebook history.
df_debit= df.loc[(df["year"]==2023) & (df["debit_credit"]=="Debit") & (df["label"] !="Mortgage")].groupby(["label","year"]).sum("amount").reset_index("year")
df_barplot= df_debit.pivot(columns="year")["amount"]
df_barplot= df_barplot.fillna(0)
df_barplot.plot(kind="barh",figsize=figsize, xlim=(0,7500), ax= ax3)
for index, value in enumerate(df_barplot[2023]):
    ax3.annotate("\N{euro sign} "+ str(int(value)), xy=(value, index-0.3), color="black")
ax3.set_title("Sum of expenses in 2023 except Mortgage")
ax3.set_ylabel('')
################# ax4 — 2022 grocery spend per weekday
chart_df = df.loc[(df["debit_credit"]=="Debit") & (df["year"] ==2022) & (df["label"] =="Grocery")].groupby(["day"])[["amount"]].sum("amount")
chart_df["amount"].plot(kind = "pie",
    figsize=figsize,
    labels = chart_df["amount"].round(2), # label each slice with its euro total
    #autopct='%0.02f%%',
    shadow=True,
    pctdistance=1.12, # the ratio between the center of each pie slice and the start of the text generated by autopct
    #colors=[], # add custom colors
    explode=[0.1,0,0,0,0,0,0], # pull the first slice out slightly
    ax=ax4
    )
ax4.set_title("Sum of Grocery expenses in 2022")
# add legend with the weekday names
ax4.legend(labels=chart_df.index, loc='upper left')
ax4.set_ylabel('')
#################### ax5 — 2022 vs 2023 label totals side by side
df_debit= df.loc[(df["year"]>=2022) & (df["debit_credit"]=="Debit") & (df["label"] !="Mortgage")].groupby(["label","year"]).sum("amount").reset_index("year")
df_barplot= df_debit.pivot(columns="year")["amount"]
df_barplot=df_barplot.fillna(0)
df_barplot.plot(kind="barh",figsize=figsize, ax=ax5)
# annotate the 2022 bars slightly below centre and the 2023 bars at centre
for index, value in enumerate(df_barplot[2022]):
    ax5.annotate("\N{euro sign} "+ str(int(value)), xy=(value, index-0.3), color="black")
for index, value in enumerate(df_barplot[2023]):
    ax5.annotate("\N{euro sign} "+ str(int(value)), xy=(value, index), color="black")
###################### ax6 — average monthly debit per year
sum_expenses=pd.concat([df.loc[df["debit_credit"]=="Debit"].groupby(["year"]).sum("amount"),numberofmonths_df],axis=1)
sum_expenses["averagedebit"]=sum_expenses["amount"]/sum_expenses["numberofmonths"]
sum_expenses["averagedebit"].plot(kind="barh", figsize=figsize, ax=ax6)
for index, value in enumerate(sum_expenses["averagedebit"]):
    # annotate each bar with its truncated euro value
    ax6.annotate("\N{euro sign} "+ str(int(value)), xy=(value, index), color="black")
ax6.set_title("Average expenses")
######################## ax7 — share of 2022 spending per weekday
pie_chart_df = df.loc[(df["debit_credit"]=="Debit") & (df["year"] ==2022) & (df["label"] !="Mortgage")].groupby(["day"])[["amount"]].sum("amount")
pie_chart_df["amount"].plot(kind = "pie",
    figsize=figsize,
    labels = None, # turn off labels on pie chart; legend added below instead
    autopct='%0.02f%%',
    shadow=True,
    pctdistance=1.12, # the ratio between the center of each pie slice and the start of the text generated by autopct
    #colors=[], # add custom colors
    explode=[0.1,0,0,0,0,0,0],
    ax= ax7
    )
# add legend with the weekday names
ax7.legend(labels=pie_chart_df.index, loc='upper left')
ax7.set_title("Sum of expenses for each day in 2022")
ax7.set_ylabel('')
plt.show()
# Fuel expenses per month, compared across years: total spend, largest single
# purchase and smallest single purchase.
fig, ax = plt.subplots(1, 3, figsize=(15, 5), sharey=True, sharex=True)
df_filter = df.loc[(df["debit_credit"] == "Debit") & (df["label"] == "Fuel")]
panels = [(sum, "Summation of Fuel Expenses"),
          ("max", "Maximum Value of Fuel Expenses"),
          ("min", "Minimum Value of Fuel Expenses")]
for pos, (agg, heading) in enumerate(panels):
    sns.barplot(x=df_filter["amount"], y=df_filter["month"], hue=df_filter["year"],
                estimator=agg, errorbar=None, ax=ax[pos])
    ax[pos].set_title(heading)
Text(0.5, 1.0, 'Minimum Value of Fuel Expenses')
# All transactions whose description mentions "SEPA" (covers the Incasso,
# Overboeking and iDEAL rows visible in the output below).
df.loc[df["description"].str.contains("SEPA")]
| date | day | month | year | name | startbalance | endbalance | debit_credit | amount | description | bankname | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 11 | 2021-07-12 | Monday | July | 2021 | ANAC | 34304.99 | 34189.39 | Debit | 115.60 | SEPA Incasso algemeen doorlopend Incassant: NL... | ABN AMRO | Insurance |
| 17 | 2021-07-15 | Thursday | July | 2021 | Other | 34051.19 | 34007.19 | Debit | 44.00 | SEPA iDEAL IBAN: NL56ING... | ABN AMRO | Online Shopping |
| 26 | 2021-07-21 | Wednesday | July | 2021 | YOUFONE | 33834.37 | 33823.37 | Debit | 11.00 | SEPA Incasso algemeen doorlopend Incassant: NL... | ABN AMRO | Youfone |
| 27 | 2021-07-21 | Wednesday | July | 2021 | YOUFONE | 33823.37 | 33812.87 | Debit | 10.50 | SEPA Incasso algemeen doorlopend Incassant: NL... | ABN AMRO | Youfone |
| 35 | 2021-07-23 | Friday | July | 2021 | SIOUX | 33747.17 | 38860.18 | Credit | 5113.01 | SEPA Overboeking IBAN: NL98RAB... | ABN AMRO | Income |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2184 | 2023-08-23 | Wednesday | August | 2023 | Youfone Nederland B.V. | 1302.86 | 1292.88 | Debit | 9.98 | Name: Youfone Nederland B.V. Description: 1019... | ING BANK | Youfone |
| 2185 | 2023-08-23 | Wednesday | August | 2023 | T-MOBILE THUIS B.V. | 1352.86 | 1302.86 | Debit | 50.00 | Name: T-MOBILE THUIS B.V. Description: T-Mobil... | ING BANK | House Fixed Exp. |
| 2190 | 2023-08-25 | Friday | August | 2023 | SIOUX | 8747.01 | 14484.15 | Credit | 5737.14 | /TRTP/SEPA OVERBOEKING/IBAN/NL98RABO0340894121... | ABN AMRO | Income |
| 2192 | 2023-08-25 | Friday | August | 2023 | ENNATUURLIJK BV | 1166.78 | 903.14 | Debit | 263.64 | Name: ENNATUURLIJK BV Description: Omschrijvin... | ING BANK | House Fixed Exp. |
| 2200 | 2023-08-28 | Monday | August | 2023 | GR-043-D | 10984.15 | 10924.15 | Debit | 60.00 | /TRTP/SEPA Incasso algemeen doorlopend/CSID/NL... | ABN AMRO | Car |
555 rows × 12 columns
# Per-label debit totals pivoted into one column per year (Mortgage excluded);
# label/year combinations with no transactions become 0.
df_debit= df.loc[(df["year"]>=2021) & (df["debit_credit"]=="Debit") & (df["label"] !="Mortgage")].groupby(["label","year"]).sum("amount").reset_index("year")
df_barplot= df_debit.pivot(columns="year",values="amount")
df_barplot=df_barplot.fillna(0)
df_barplot
| year | 2021 | 2022 | 2023 |
|---|---|---|---|
| label | |||
| ABNAMRO Bank | 20.10 | 46.20 | 34.80 |
| Abroad | 129.50 | 584.11 | 1074.90 |
| Airplane | 0.00 | 75.00 | 2372.58 |
| Books&Toys | 163.80 | 547.75 | 94.73 |
| Car | 373.95 | 1279.98 | 1005.89 |
| Car Park | 50.40 | 163.21 | 356.11 |
| Cinema & Fun | 56.10 | 59.80 | 436.25 |
| Courses | 265.70 | 1833.95 | 459.20 |
| Decathlon | 315.94 | 821.68 | 117.92 |
| Fuel | 584.58 | 1412.53 | 821.49 |
| Furniture | 3423.87 | 0.00 | 0.00 |
| Grocery | 3001.08 | 6373.00 | 5368.87 |
| Health | 1.00 | 784.17 | 1223.10 |
| Holiday | 0.00 | 116.00 | 0.00 |
| Holland & Barrett | 58.26 | 301.27 | 33.03 |
| House Fixed Exp. | 762.27 | 4119.60 | 3546.66 |
| House Rent | 3885.00 | 0.00 | 0.00 |
| ING Bank | 5.02 | 138.50 | 33.20 |
| Insurance | 2457.13 | 5723.02 | 4432.79 |
| Kruidvat | 237.96 | 1503.70 | 845.38 |
| Money Transfer | 4544.00 | 562.49 | 1189.44 |
| Money Withdraw | 10.00 | 140.00 | 280.00 |
| Notary | 24704.72 | 0.00 | 0.00 |
| Online Shopping | 2717.13 | 5442.75 | 1753.80 |
| Other Expenses | 1974.47 | 2991.82 | 916.62 |
| REVOLUT | 0.00 | 7000.00 | 2480.00 |
| Restaurant | 550.50 | 1947.35 | 507.15 |
| Sioux | 64.50 | 1528.00 | 143.24 |
| Sport | 130.80 | 765.92 | 413.50 |
| Store | 3257.27 | 4638.13 | 4308.57 |
| Transportation | 43.75 | 130.00 | 60.00 |
| Turk market | 494.13 | 1794.30 | 299.40 |
| Youfone | 128.00 | 222.33 | 176.20 |
# Year-over-year comparison of the per-label totals from df_barplot:
# 2021 on its own subplot, 2022 and 2023 grouped on a second one.
df_barplot.plot(kind="barh", figsize=(10, 10), stacked=False,
                subplots=[(2021,), (2022, 2023)], legend=True)
# Fixed: `plt.show` was missing its parentheses, so the call never executed
# (the cell output showed the function object instead of rendering).  A
# disabled annotation block kept here as a string literal was dead code and
# has been removed.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Share of total 2022 spending (mortgage excluded) per weekday.
pie_chart_df = df.loc[(df["debit_credit"]=="Debit") & (df["year"] ==2022) & (df["label"] !="Mortgage")].groupby(["day"])[["amount"]].sum()
pie_chart_df["amount"].plot(kind = "pie",
    figsize=(15, 6),
    labels = None,                    # no slice labels; a legend is added below
    autopct='%0.02f%%',
    shadow=True,
    pctdistance=1.12,                 # percentage text just outside the rim
    #colors=[], # add custom colors
    explode=[0.1,0,0,0,0,0,0]         # pull the first slice out slightly
    )
# Fixed: the legend previously used `chart_df.index`, a leftover frame from an
# earlier cell; the labels must come from the frame actually plotted.
plt.legend(labels=pie_chart_df.index, loc='upper left')
# Fixed: the cell set a placeholder title ('XXXX') and then overwrote it with
# 'Grocery', although the data plotted is ALL non-mortgage 2022 expenses.
plt.title('Sum of expenses for each day in 2022', y=1.1)
plt.rcParams["figure.autolayout"] = True
plt.axis('equal')
plt.show()
# Number of 2022 grocery transactions per weekday, drawn as a pie chart whose
# slice labels are the raw counts.
chart_df = df.loc[(df["debit_credit"] == "Debit") & (df["year"] == 2022) & (df["label"] == "Grocery")].groupby(["day"])[["amount"]].count()
chart_df["amount"].plot(
    kind="pie",
    figsize=(15, 6),
    labels=chart_df["amount"],        # annotate every slice with its count
    shadow=True,
    pctdistance=1.12,                 # kept for parity with the other pies
    explode=[0.1, 0, 0, 0, 0, 0, 0],  # pull the first slice out slightly
)
plt.title('Grocery', y=1.1)
plt.axis('equal')
# legend maps slice colours back to weekday names
plt.legend(labels=chart_df.index, loc='best')
plt.show()
chart_df
| amount | |
|---|---|
| day | |
| Monday | 30 |
| Tuesday | 29 |
| Wednesday | 24 |
| Thursday | 22 |
| Friday | 23 |
| Saturday | 18 |
| Sunday | 21 |
df.loc[(df["debit_credit"]=="Debit") & (df["year"] ==2022) & (df["label"] =="Turk market")]
| date | day | month | year | name | startbalance | endbalance | debit_credit | amount | description | bankname | label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 535 | 2022-02-02 | Wednesday | February | 2022 | OZTURK FRESH FOOD | 7378.27 | 7272.28 | Debit | 105.99 | BEA NR:0QRZ02 02.02.22/17.41 Ozturk Fresh ... | ABN AMRO | Turk market |
| 540 | 2022-02-06 | Sunday | February | 2022 | IZMIR | 7109.99 | 7081.01 | Debit | 28.98 | BEA NR:071580EQ 06.02.22/17.41 Izmir Superma... | ABN AMRO | Turk market |
| 620 | 2022-03-05 | Saturday | March | 2022 | CAN SLAGERIJ | 6441.15 | 6430.35 | Debit | 10.80 | BEA NR:N615N8 05.03.22/13.34 Can Slagerij ... | ABN AMRO | Turk market |
| 624 | 2022-03-05 | Saturday | March | 2022 | OZTURK FRESH FOOD | 6430.35 | 6393.74 | Debit | 36.61 | BEA NR:0QRZ02 05.03.22/13.37 Ozturk Fresh ... | ABN AMRO | Turk market |
| 672 | 2022-03-17 | Thursday | March | 2022 | OZTURK FRESH FOOD | 4580.37 | 4535.90 | Debit | 44.47 | BEA NR:0QRZ02 17.03.22/18.22 Ozturk Fresh ... | ABN AMRO | Turk market |
| 675 | 2022-03-17 | Thursday | March | 2022 | CAN SLAGERIJ | 4623.37 | 4580.37 | Debit | 43.00 | BEA NR:N615N8 17.03.22/18.16 Can Slagerij ... | ABN AMRO | Turk market |
| 700 | 2022-03-26 | Saturday | March | 2022 | CAN SLAGERIJ | 9238.91 | 9219.71 | Debit | 19.20 | BEA NR:N615N8 26.03.22/13.13 Can Slagerij ... | ABN AMRO | Turk market |
| 701 | 2022-03-26 | Saturday | March | 2022 | OZTURK FRESH FOOD | 9219.71 | 9139.69 | Debit | 80.02 | BEA NR:0QRZ02 26.03.22/13.24 Ozturk Fresh ... | ABN AMRO | Turk market |
| 733 | 2022-04-06 | Wednesday | April | 2022 | IZMIR | 5855.03 | 5801.49 | Debit | 53.54 | BEA, Betaalpas Izmir Superma... | ABN AMRO | Turk market |
| 741 | 2022-04-10 | Sunday | April | 2022 | IZMIR | 5700.07 | 5690.11 | Debit | 9.96 | BEA, Betaalpas Izmir Superma... | ABN AMRO | Turk market |
| 749 | 2022-04-14 | Thursday | April | 2022 | CAN SLAGERIJ | 5477.69 | 5457.94 | Debit | 19.75 | BEA, Betaalpas Can Slagerij ... | ABN AMRO | Turk market |
| 750 | 2022-04-14 | Thursday | April | 2022 | OZTURK FRESH FOOD | 5457.94 | 5396.23 | Debit | 61.71 | BEA, Betaalpas Ozturk Fresh ... | ABN AMRO | Turk market |
| 786 | 2022-04-25 | Monday | April | 2022 | CAN SLAGERIJ | 8343.48 | 8306.88 | Debit | 36.60 | BEA, Betaalpas Can Slagerij ... | ABN AMRO | Turk market |
| 788 | 2022-04-25 | Monday | April | 2022 | OZTURK FRESH FOOD | 8306.88 | 8186.74 | Debit | 120.14 | BEA, Betaalpas Ozturk Fresh ... | ABN AMRO | Turk market |
| 849 | 2022-05-14 | Saturday | May | 2022 | CAN SLAGERIJ | 3725.55 | 3714.65 | Debit | 10.90 | BEA, Betaalpas Can Slagerij ... | ABN AMRO | Turk market |
| 852 | 2022-05-14 | Saturday | May | 2022 | OZTURK FRESH FOOD | 3714.65 | 3623.57 | Debit | 91.08 | BEA, Betaalpas Ozturk Fresh ... | ABN AMRO | Turk market |
| 865 | 2022-05-23 | Monday | May | 2022 | OZTURK FRESH FOOD | 3097.92 | 3056.72 | Debit | 41.20 | BEA, Betaalpas Ozturk Fresh ... | ABN AMRO | Turk market |
| 866 | 2022-05-23 | Monday | May | 2022 | CAN SLAGERIJ | 3120.72 | 3097.92 | Debit | 22.80 | BEA, Betaalpas Can Slagerij ... | ABN AMRO | Turk market |
| 965 | 2022-06-20 | Monday | June | 2022 | IZZET | 4649.61 | 4576.45 | Debit | 73.16 | BEA, Betaalpas Helal Slageri... | ABN AMRO | Turk market |
| 1020 | 2022-07-03 | Sunday | July | 2022 | IZMIR | 6097.30 | 6030.98 | Debit | 66.32 | BEA, Betaalpas Izmir Superma... | ABN AMRO | Turk market |
| 1083 | 2022-07-24 | Sunday | July | 2022 | IZMIR | 3934.67 | 3889.85 | Debit | 44.82 | BEA, Betaalpas IZMIR SUPERMA... | ABN AMRO | Turk market |
| 1135 | 2022-08-28 | Sunday | August | 2022 | IZMIR | 8840.42 | 8747.32 | Debit | 93.10 | BEA, Betaalpas Izmir Superma... | ABN AMRO | Turk market |
| 1167 | 2022-09-02 | Friday | September | 2022 | CAN SLAGERIJ | 5581.63 | 5542.43 | Debit | 39.20 | BEA, Betaalpas Can Slagerij ... | ABN AMRO | Turk market |
| 1168 | 2022-09-02 | Friday | September | 2022 | OZTURK FRESH FOOD | 5542.43 | 5502.95 | Debit | 39.48 | BEA, Betaalpas Ozturk Fresh ... | ABN AMRO | Turk market |
| 1186 | 2022-09-06 | Tuesday | September | 2022 | CAN SLAGERIJ | 4932.72 | 4875.77 | Debit | 56.95 | BEA, Betaalpas Can Slagerij ... | ABN AMRO | Turk market |
| 1187 | 2022-09-06 | Tuesday | September | 2022 | OZTURK FRESH FOOD | 4875.77 | 4835.04 | Debit | 40.73 | BEA, Betaalpas Ozturk Fresh ... | ABN AMRO | Turk market |
| 1208 | 2022-09-13 | Tuesday | September | 2022 | CAN SLAGERIJ | 4195.68 | 4178.13 | Debit | 17.55 | BEA, Betaalpas Can Slagerij ... | ABN AMRO | Turk market |
| 1209 | 2022-09-13 | Tuesday | September | 2022 | OZTURK FRESH FOOD | 4178.13 | 4153.38 | Debit | 24.75 | BEA, Betaalpas Ozturk Fresh ... | ABN AMRO | Turk market |
| 1249 | 2022-09-25 | Sunday | September | 2022 | IZMIR | 8113.19 | 8084.50 | Debit | 28.69 | BEA, Betaalpas IZMIR SUPERMA... | ABN AMRO | Turk market |
| 1294 | 2022-10-09 | Sunday | October | 2022 | IZMIR | 4771.55 | 4750.32 | Debit | 21.23 | BEA, Betaalpas IZMIR SUPERMA... | ABN AMRO | Turk market |
| 1295 | 2022-10-09 | Sunday | October | 2022 | IZMIR | 4787.55 | 4771.55 | Debit | 16.00 | BEA, Betaalpas IZMIR SUPERMA... | ABN AMRO | Turk market |
| 1317 | 2022-10-16 | Sunday | October | 2022 | IZMIR | 3958.00 | 3925.06 | Debit | 32.94 | BEA, Betaalpas Izmir Superma... | ABN AMRO | Turk market |
| 1339 | 2022-10-26 | Wednesday | October | 2022 | OZTURK FRESH FOOD | 8890.54 | 8820.75 | Debit | 69.79 | BEA, Betaalpas Ozturk Fresh ... | ABN AMRO | Turk market |
| 1340 | 2022-10-26 | Wednesday | October | 2022 | CAN SLAGERIJ | 8903.49 | 8890.54 | Debit | 12.95 | BEA, Betaalpas Can Slagerij ... | ABN AMRO | Turk market |
| 1341 | 2022-10-26 | Wednesday | October | 2022 | CAN SLAGERIJ | 8933.99 | 8903.49 | Debit | 30.50 | BEA, Betaalpas Can Slagerij ... | ABN AMRO | Turk market |
| 1414 | 2022-11-22 | Tuesday | November | 2022 | OZTURK FRESH FOOD | 3486.63 | 3359.42 | Debit | 127.21 | BEA, Betaalpas Ozturk Fresh ... | ABN AMRO | Turk market |
| 1417 | 2022-11-22 | Tuesday | November | 2022 | CAN SLAGERIJ | 3527.43 | 3486.63 | Debit | 40.80 | BEA, Betaalpas Can Slagerij ... | ABN AMRO | Turk market |
| 1493 | 2022-12-24 | Saturday | December | 2022 | IZMIR | 8209.52 | 8128.14 | Debit | 81.38 | BEA, Betaalpas IZMIR SUPERMA... | ABN AMRO | Turk market |
df.loc[(df["debit_credit"]=="Debit") & (df["year"] ==2022) & (df["label"] =="Turk market")].groupby("date").sum()["amount"].plot(kind="barh")
C:\Users\Erdinc\AppData\Local\Temp\ipykernel_1992\3625256799.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
df.loc[(df["debit_credit"]=="Debit") & (df["year"] ==2022) & (df["label"] =="Turk market")].groupby("date").sum()["amount"].plot(kind="barh")
<Axes: ylabel='date'>
# Total 2022 debit amount per weekday, smallest bar at the bottom.
# FIX: the original chained .sort_values(ascending=False).sort_values() — the
# first (descending) sort is immediately discarded; one ascending sort suffices.
df.loc[(df["debit_credit"]=="Debit") & (df["year"]==2022)].groupby("day")["amount"]\
    .sum().sort_values().plot(kind="barh", figsize=(5, 10),
                              title="Sum of Expenses according to Days,2022", grid=True)
plt.show()
# Export January 2022 debit rows to a single-sheet workbook.
# BUG FIX: columns were addressed as "Debit/credit"/"Year"/"Month", but the
# cleaned frame uses "debit_credit"/"year"/"month" (as every other query in
# this notebook does) — the original raised KeyError.
with pd.ExcelWriter('output1.xlsx') as writer:
    df.loc[(df["debit_credit"]=="Debit") & (df["year"]==2022) & (df["month"]=="January")].to_excel(writer, sheet_name="all")
    #df_income.to_excel(writer, sheet_name='income')
    #df_expense.to_excel(writer, sheet_name='expense')
# Monthly fuel spend per year (debits from 2021 onwards).
# FIX: `.sum("amount")` passed "amount" positionally as GroupBy.sum's
# `numeric_only` flag; the column is already selected, so a plain .sum()
# is the correct call.
df_query = df.loc[(df["debit_credit"]=="Debit") & (df["year"]>=2021) & (df["label"]=="Fuel")]\
    .groupby(["year","month"])[["amount"]].sum().reset_index()
df_query
| year | month | amount | |
|---|---|---|---|
| 0 | 2021 | January | 0.00 |
| 1 | 2021 | February | 0.00 |
| 2 | 2021 | March | 0.00 |
| 3 | 2021 | April | 0.00 |
| 4 | 2021 | May | 0.00 |
| 5 | 2021 | June | 0.00 |
| 6 | 2021 | July | 20.12 |
| 7 | 2021 | August | 57.28 |
| 8 | 2021 | September | 122.42 |
| 9 | 2021 | October | 148.42 |
| 10 | 2021 | November | 130.57 |
| 11 | 2021 | December | 105.77 |
| 12 | 2022 | January | 77.16 |
| 13 | 2022 | February | 77.69 |
| 14 | 2022 | March | 96.92 |
| 15 | 2022 | April | 136.81 |
| 16 | 2022 | May | 190.91 |
| 17 | 2022 | June | 194.24 |
| 18 | 2022 | July | 76.03 |
| 19 | 2022 | August | 100.66 |
| 20 | 2022 | September | 209.43 |
| 21 | 2022 | October | 60.17 |
| 22 | 2022 | November | 69.10 |
| 23 | 2022 | December | 123.41 |
| 24 | 2023 | January | 81.49 |
| 25 | 2023 | February | 143.69 |
| 26 | 2023 | March | 0.00 |
| 27 | 2023 | April | 35.76 |
| 28 | 2023 | May | 188.63 |
| 29 | 2023 | June | 175.66 |
| 30 | 2023 | July | 131.24 |
| 31 | 2023 | August | 65.02 |
| 32 | 2023 | September | 0.00 |
| 33 | 2023 | October | 0.00 |
| 34 | 2023 | November | 0.00 |
| 35 | 2023 | December | 0.00 |
# Reshape to wide form — one row per year, months as columns.
df_query = df_query.pivot(index="month", columns="year", values="amount").transpose()
df_query
| month | January | February | March | April | May | June | July | August | September | October | November | December |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| year | ||||||||||||
| 2021 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 20.12 | 57.28 | 122.42 | 148.42 | 130.57 | 105.77 |
| 2022 | 77.16 | 77.69 | 96.92 | 136.81 | 190.91 | 194.24 | 76.03 | 100.66 | 209.43 | 60.17 | 69.10 | 123.41 |
| 2023 | 81.49 | 143.69 | 0.00 | 35.76 | 188.63 | 175.66 | 131.24 | 65.02 | 0.00 | 0.00 | 0.00 | 0.00 |
# One horizontal bar subplot per month, comparing the three years.
df_query.plot(
    kind="barh",
    subplots=True,
    figsize=(10, 15),
    title="Fuel Consumption for Every Month in Each Year",
)
df_query
| month | January | February | March | April | May | June | July | August | September | October | November | December |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| year | ||||||||||||
| 2021 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 20.12 | 57.28 | 122.42 | 148.42 | 130.57 | 105.77 |
| 2022 | 77.16 | 77.69 | 96.92 | 136.81 | 190.91 | 194.24 | 76.03 | 100.66 | 209.43 | 60.17 | 69.10 | 123.41 |
| 2023 | 81.49 | 143.69 | 0.00 | 35.76 | 188.63 | 175.66 | 131.24 | 65.02 | 0.00 | 0.00 | 0.00 | 0.00 |
# Monthly fuel spend per year (duplicate of the earlier cell, kept because the
# pivot below consumes a fresh long-form frame).
# FIX: `.sum("amount")` passed "amount" positionally as GroupBy.sum's
# `numeric_only` flag; the column is already selected, so plain .sum() is correct.
df_query = df.loc[(df["debit_credit"]=="Debit") & (df["year"]>=2021) & (df["label"]=="Fuel")]\
    .groupby(["year","month"])[["amount"]].sum().reset_index()
df_query
| year | month | amount | |
|---|---|---|---|
| 0 | 2021 | January | 0.00 |
| 1 | 2021 | February | 0.00 |
| 2 | 2021 | March | 0.00 |
| 3 | 2021 | April | 0.00 |
| 4 | 2021 | May | 0.00 |
| 5 | 2021 | June | 0.00 |
| 6 | 2021 | July | 20.12 |
| 7 | 2021 | August | 57.28 |
| 8 | 2021 | September | 122.42 |
| 9 | 2021 | October | 148.42 |
| 10 | 2021 | November | 130.57 |
| 11 | 2021 | December | 105.77 |
| 12 | 2022 | January | 77.16 |
| 13 | 2022 | February | 77.69 |
| 14 | 2022 | March | 96.92 |
| 15 | 2022 | April | 136.81 |
| 16 | 2022 | May | 190.91 |
| 17 | 2022 | June | 194.24 |
| 18 | 2022 | July | 76.03 |
| 19 | 2022 | August | 100.66 |
| 20 | 2022 | September | 209.43 |
| 21 | 2022 | October | 60.17 |
| 22 | 2022 | November | 69.10 |
| 23 | 2022 | December | 123.41 |
| 24 | 2023 | January | 81.49 |
| 25 | 2023 | February | 143.69 |
| 26 | 2023 | March | 0.00 |
| 27 | 2023 | April | 35.76 |
| 28 | 2023 | May | 188.63 |
| 29 | 2023 | June | 175.66 |
| 30 | 2023 | July | 131.24 |
| 31 | 2023 | August | 65.02 |
| 32 | 2023 | September | 0.00 |
| 33 | 2023 | October | 0.00 |
| 34 | 2023 | November | 0.00 |
| 35 | 2023 | December | 0.00 |
# Reshape the other way — months as rows, years as columns.
df_query = df_query.pivot(index="year", columns="month", values="amount").transpose()
df_query
| year | 2021 | 2022 | 2023 |
|---|---|---|---|
| month | |||
| January | 0.00 | 77.16 | 81.49 |
| February | 0.00 | 77.69 | 143.69 |
| March | 0.00 | 96.92 | 0.00 |
| April | 0.00 | 136.81 | 35.76 |
| May | 0.00 | 190.91 | 188.63 |
| June | 0.00 | 194.24 | 175.66 |
| July | 20.12 | 76.03 | 131.24 |
| August | 57.28 | 100.66 | 65.02 |
| September | 122.42 | 209.43 | 0.00 |
| October | 148.42 | 60.17 | 0.00 |
| November | 130.57 | 69.10 | 0.00 |
| December | 105.77 | 123.41 | 0.00 |
# One horizontal bar subplot per year, comparing the months.
df_query.plot(
    kind="barh",
    subplots=True,
    figsize=(10, 15),
    title="Fuel Consumption for Every Year in Each Month",
)
df_query
| year | 2021 | 2022 | 2023 |
|---|---|---|---|
| month | |||
| January | 0.00 | 77.16 | 81.49 |
| February | 0.00 | 77.69 | 143.69 |
| March | 0.00 | 96.92 | 0.00 |
| April | 0.00 | 136.81 | 35.76 |
| May | 0.00 | 190.91 | 188.63 |
| June | 0.00 | 194.24 | 175.66 |
| July | 20.12 | 76.03 | 131.24 |
| August | 57.28 | 100.66 | 65.02 |
| September | 122.42 | 209.43 | 0.00 |
| October | 148.42 | 60.17 | 0.00 |
| November | 130.57 | 69.10 | 0.00 |
| December | 105.77 | 123.41 | 0.00 |
# 2021 monthly fuel spend, with euro amounts annotated on the non-zero bars.
df_query = df.loc[(df["debit_credit"] == "Debit")
                  & (df["year"] == 2021)
                  & (df["label"] == "Fuel")].groupby("month")["amount"].sum()
df_query.plot(kind="barh", figsize=(8, 8), title="Fuel Expenses according to Months, 2021")
for pos, amt in enumerate(df_query):
    if amt:  # skip months with no fuel purchases
        plt.annotate("\N{euro sign} " + str(int(amt)), xy=(amt, pos - 0.1), color="black")
plt.show()
df_query
month January 0.00 February 0.00 March 0.00 April 0.00 May 0.00 June 0.00 July 20.12 August 57.28 September 122.42 October 148.42 November 130.57 December 105.77 Name: amount, dtype: float64
# 2022 monthly fuel spend, with euro amounts annotated on the non-zero bars.
df_query = df.loc[(df["debit_credit"] == "Debit")
                  & (df["year"] == 2022)
                  & (df["label"] == "Fuel")].groupby("month")["amount"].sum()
df_query.plot(kind="barh", figsize=(8, 8), title="Fuel Expenses according to Months, 2022")
for pos, amt in enumerate(df_query):
    if amt:  # skip months with no fuel purchases
        plt.annotate("\N{euro sign} " + str(int(amt)), xy=(amt, pos - 0.1), color="black")
plt.show()
df_query
month January 77.16 February 77.69 March 96.92 April 136.81 May 190.91 June 194.24 July 76.03 August 100.66 September 209.43 October 60.17 November 69.10 December 123.41 Name: amount, dtype: float64
# 2023 monthly fuel spend, with euro amounts annotated on the non-zero bars.
df_query = df.loc[(df["debit_credit"] == "Debit")
                  & (df["year"] == 2023)
                  & (df["label"] == "Fuel")].groupby("month")["amount"].sum()
df_query.plot(kind="barh", figsize=(8, 8), title="Fuel Expenses according to Months, 2023")
for pos, amt in enumerate(df_query):
    if amt:  # skip months with no fuel purchases
        plt.annotate("\N{euro sign} " + str(int(amt)), xy=(amt, pos - 0.1), color="black")
plt.show()
df_query
month January 81.49 February 143.69 March 0.00 April 35.76 May 188.63 June 175.66 July 131.24 August 65.02 September 0.00 October 0.00 November 0.00 December 0.00 Name: amount, dtype: float64
# How many distinct months of data each year has (partial years < 12).
months = dict(df.groupby("year")["month"].unique())
numberofMonths = {year: len(month_list) for year, month_list in months.items()}
# Monthly grocery totals for 2022, plotted as horizontal bars.
grocery_mean = (
    df.loc[(df["debit_credit"] == "Debit")
           & (df["year"] == 2022)
           & (df["label"] == "Grocery")]
    .groupby(["year", "month"])["amount"]
    .sum()
)
grocery_mean.plot(kind="barh", figsize=(5, 10), title="Grocery Expenses according to Months, 2022")
plt.show()
grocery_mean
year month
2022 January 756.04
February 483.65
March 443.28
April 472.95
May 569.48
June 537.80
July 499.67
August 258.11
September 657.17
October 445.22
November 557.98
December 691.65
Name: amount, dtype: float64
# Average monthly spend: Turkish markets vs. regular supermarkets, per year.
turk = []
grocery = []
months = dict(df.groupby("year")["month"].unique())
# Months of data per year, so partial years are averaged over the months present.
numberofMonths = {year: len(month_list) for year, month_list in months.items()}
for year in numberofMonths:
    print("For year ", year)
    df_groc = df.loc[(df["debit_credit"] == "Debit") & (df["year"] == year)
                     & (df["label"] == "Grocery")].groupby("year")["amount"].sum() / numberofMonths[year]
    df_turk = df.loc[(df["debit_credit"] == "Debit") & (df["year"] == year)
                     & (df["label"] == "Turk market")].groupby("year")["amount"].sum() / numberofMonths[year]
    turk.append(df_turk.iloc[0])
    grocery.append(df_groc.iloc[0])
    print("Turk Market: \N{euro sign}{:.2f} ".format(df_turk.iloc[0]))
    print("Grocery: \N{euro sign}{:.2f}".format(df_groc.iloc[0]))
market_mean = {"Year": list(numberofMonths.keys()),
               "Turk Market": turk,
               "Grocery": grocery}
market_mean_df = pd.DataFrame(market_mean, index=market_mean["Year"])
# FIX: the original passed both columns=["Year"] and axis=0 to drop(); `axis`
# is contradictory alongside the `columns` keyword and has been removed.
market_mean_df.drop(columns=["Year"]).plot(kind="barh")
market_mean_df
For year 2021 Turk Market: €82.36 Grocery: €500.18 For year 2022 Turk Market: €149.53 Grocery: €531.08 For year 2023 Turk Market: €37.42 Grocery: €671.11
| Year | Turk Market | Grocery | |
|---|---|---|---|
| 2021 | 2021 | 82.355 | 500.180000 |
| 2022 | 2022 | 149.525 | 531.083333 |
| 2023 | 2023 | 37.425 | 671.108750 |
df.loc[(df["debit_credit"]=="Debit") & (df["year"]>=2022)].groupby(["day","year"])[["amount"]].aggregate(["mean", max,np.sum]).T
| day | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| year | 2022 | 2023 | 2022 | 2023 | 2022 | 2023 | 2022 | 2023 | 2022 | 2023 | 2022 | 2023 | 2022 | 2023 | |
| amount | mean | 86.713988 | 121.327714 | 112.79305 | 88.334952 | 59.021727 | 80.455 | 96.388 | 54.812 | 62.017432 | 53.959873 | 36.716646 | 45.962115 | 34.48901 | 27.092714 |
| max | 1479.890000 | 1479.890000 | 1479.89000 | 1479.890000 | 1479.890000 | 1479.890 | 2500.000 | 1479.890 | 1479.890000 | 263.640000 | 644.000000 | 1044.000000 | 267.46000 | 162.610000 | |
| sum | 15001.520000 | 12739.410000 | 15903.82000 | 9275.170000 | 8204.020000 | 9010.960 | 13012.380 | 4659.020 | 9178.580000 | 4262.830000 | 6021.530000 | 4780.060000 | 3483.39000 | 1896.490000 | |
# Dump the entire cleaned frame to a single-sheet workbook.
with pd.ExcelWriter('output.xlsx') as writer:
    df.to_excel(writer, sheet_name="all")
    #df_income.to_excel(writer, sheet_name='income')
    #df_expense.to_excel(writer, sheet_name='expense')
from dash import dash, html, dcc, Input, Output, dash_table
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import date
import plotly.express as px

# Read the cleaned transaction data exported by the analysis notebook.
df = pd.read_csv(r"D:\00_BACKUP_BY_DRIVE\Bank_expense_analysis\expenses.csv")

# Create a dash application
app = dash.Dash(__name__)

# Layout: dashboard title, a row of four filter drop-downs, then two plots
# (monthly bar chart + per-day pie chart) rendered side by side.
app.layout = html.Div([
    html.H1('Descriptive Analysis of Expenses in the Netherlands', style={'textAlign':'center'}),
    html.Div([
        html.Div([
            dcc.Dropdown(id='year-drop-down', options=df["year"].unique(), value=2022,
                         multi=False, placeholder='Select a Year', style={'width':'100%','padding':'3px', 'text-align-last' : 'center'}),
            dcc.Dropdown(id='label-drop-down', options=df["label"].unique(), value=["Grocery", "Turk market"],
                         multi=True, placeholder='Select a Label', style={'width':'100%','padding':'3px', 'text-align-last' : 'center'}),
            dcc.Dropdown(id='month-drop-down', options=["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"],
                         multi=False, placeholder='Select a Month', style={'width':'100%','padding':'3px', 'text-align-last' : 'center'}),
            dcc.Dropdown(id='day-drop-down', options=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
                         multi=False, placeholder='Select a Day', style={'width':'100%','padding':'3px', 'text-align-last' : 'center'}),
        ], style={'display': 'flex','width':'100%'}),
        #dash_table.DataTable(data=df.to_dict('records'), page_size=10,column_selectable=True),
        html.Div([
            html.Div([], id='plot-1'),
            html.Div([], id='plot-2')
        ], style={'display': 'flex','textAlign':'center'}),
    ]),
])


@app.callback([Output(component_id="plot-1", component_property="children"),
               Output(component_id="plot-2", component_property="children")],
              [Input(component_id="year-drop-down", component_property="value"),
               Input(component_id="label-drop-down", component_property="value")])
def get_graph(year, label):
    """Build a monthly bar chart and a per-day pie chart for the selection.

    year  -- year chosen in the year drop-down.
    label -- selected label(s); a list for multi-select, but may arrive as a
             plain string, so it is normalised below.
    """
    # ROBUSTNESS: normalise a single-string selection to a list before joining.
    if isinstance(label, str):
        label = [label]
    # FIX: the manual index loop (with debug prints) is replaced by str.join;
    # "A|B|..." acts as a case-insensitive regex alternation in str.contains.
    label_txt = "|".join(label)
    # FIX: the original did groupby(["month"])[["month","amount"]].sum("amount"),
    # relying on a truthy `numeric_only` to drop the month column; selecting
    # only "amount" and calling .sum() yields the same frame explicitly.
    df_query = df.loc[(df["debit_credit"] == "Debit") & (df["year"] == year)
                      & (df["label"].str.contains(label_txt, case=False))].groupby("month")[["amount"]].sum()
    bar_data = df_query.reset_index().sort_values(by="amount", ascending=False)
    bar_fig = px.bar(bar_data, x="month", y="amount", title=label_txt + " Expenses in " + str(year))
    chart_df = df.loc[(df["debit_credit"] == "Debit") & (df["year"] == year)
                      & (df["label"].str.contains(label_txt, case=False))].groupby(["day"])[["amount"]].sum()
    pie_fig = px.pie(chart_df.reset_index(), values="amount", names="day",
                     title=label_txt + " Expenses according to days in " + str(year))
    return [dcc.Graph(figure=bar_fig), dcc.Graph(figure=pie_fig)]


if __name__ == '__main__':
    app.run_server()
from dash import dash, html, dcc, Input, Output
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import date
import plotly.express as px

# Read the cleaned transaction data exported by the analysis notebook.
df = pd.read_csv(r"D:\00_BACKUP_BY_DRIVE\Bank_expense_analysis\expenses.csv")

# Create a dash application
app = dash.Dash(__name__)

# Layout: header, date/year/month/day filters on the left, label/scale/year
# controls on the right, and a graph container underneath.
app.layout = html.Div([
    # BUG FIX: the style key was 'childrennt-size' (typo); 'font-size' is the
    # CSS property actually intended for the 40px heading.
    html.H1('Descriptive Analyses of Expenses', style={'textAlign': 'center', 'color': '#503D36', 'font-size': 40}),
    html.Div([
        html.Div([
            dcc.DatePickerRange(
                id='my-date-picker-range',
                min_date_allowed=df["date"].min(),
                max_date_allowed=df["date"].max(),
                start_date=df["date"].min(),
                end_date=df["date"].max()),
            html.Div([
                html.Div([
                    html.H2('Select Year:', style={'margin-right': '2em'})
                ]),
                dcc.Dropdown(
                    id='year-drop-down',
                    options=df['year'].unique(),
                    multi=False,
                    placeholder='Select a year',
                    style={'width':'80%', 'padding':'3px', 'font-size': '20px', 'text-align-last' : 'center'}
                )], style={'display':'flex'})
        ]),
        dcc.Dropdown(
            id='month-drop-down',
            options=df['month'].unique(),
            value=df["month"].unique(),
            multi=False,
            placeholder='Select month(s)'
        ),
        dcc.Dropdown(
            id='day-drop-down',
            options=df['day'].unique(),
            value=df["day"].unique(),
            multi=False,
            placeholder='Select day(s)'
        ),
    ], style={'width': '49%', 'display': 'inline-block'}),
    html.Div([
        dcc.Dropdown(
            id='label-drop-down',
            options=np.sort(df['label'].unique()),
            value="Grocery",
        ),
        dcc.RadioItems(
            ['Linear', 'Log'],
            'Linear',
            id='crossfilter-yaxis-type',
            labelStyle={'display': 'inline-block', 'marginTop': '5px'}
        ),
        html.Br(),
        dcc.Slider(
            df['year'].min(),
            df['year'].max(),
            step=None,
            id='slider',
            value=df['year'].max(),
            marks={str(year): str(year) for year in df['year'].unique()}
        )], style={'width': '49%', 'float': 'right', 'display': 'inline-block'}),
    html.Div([], id="graph")
], style={'padding': '10px 5px'})


@app.callback(Output(component_id="graph", component_property="children"),
              Input(component_id="year-drop-down", component_property="value"),
              Input(component_id="label-drop-down", component_property="value"))
def get_pie_chart(year, label):
    """Per-weekday share of debit spending for the chosen year and label."""
    chart_df = df.loc[(df["debit_credit"] == "Debit") & (df["year"] == year)
                      & (df["label"] == label)].groupby(["day"])[["amount"]].sum()
    fig = px.pie(data_frame=chart_df, values="amount")
    return dcc.Graph(figure=fig)


if __name__ == '__main__':
    app.run_server()